1 Setting up the environment

1.1 Reset workspace and load libraries

rm(list=ls())
gc()
library(tidyverse)
library(qgraph)
library(pander)
library(summarytools)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(tidymodels)
library(knitr)
library(extrafont)
## for poisson class of elastic net
library(poissonreg)
library("sva")
### plotting libraries
library(ggtext)
library(ggpubr)
library(cowplot)
library(ggthemes)
### package for pls analysis (all packages are necessary for the model to run)
library("pls")
library("mixOmics")
library(plsmod)
### scatter plot packages

library(viridis)
library(ggpointdensity)

mixOmics was forcely installed in the environment. Some the select function from dplyr may not work. This is also true for map functions. So dplyr::select and purrr::map are suggested

1.2 Setting up paths

Using ABCD 4.0

source(paste0(scriptfold,"stacking_gfactor_modelling/r_functions.R"))

set up parallel

# parallel for ubuntu
doParallel::registerDoParallel(cores=15)  

## this one works for ubuntu but slow
#library(doFuture)
#registerDoFuture()
#plan(multicore(workers = 30))

### parallel for windows

#library(doFuture)
#registerDoFuture()
#plan(multisession(workers = 30))

2 Load data

2.1 Family relationship

ACS <-read_csv(paste0(dataFold,"ACSPSW03_DATA_TABLE.csv")) 
## Rows: 23101 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): SUBJECTKEY, SRC_SUBJECT_ID, INTERVIEW_DATE, SEX, EVENTNAME, GENETI...
## dbl (18): ACSPSW03_ID, DATASET_ID, INTERVIEW_AGE, RACE_ETHNICITY, REL_FAMILY...
## lgl  (6): GENETIC_PAIRED_SUBJECTID_4, GENETIC_PI_HAT_3, GENETIC_PI_HAT_4, GE...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#knitr::kable(glimpse(ACS))

#race_ethnicity
#1 = White; 2 = Black; 3 = Hispanic; 4 = Asian; 5 = Other

# guardian-report relationship
# Relationship of the participant in his or her family
# 0 = single; 1 = sibling; 2 = twin; 3 = triplet
# ACS %>% count(REL_RELATIONSHIP)

ACSselected <- ACS %>% 
  dplyr::select(all_of(c("SUBJECTKEY", "EVENTNAME", "SEX", "INTERVIEW_AGE", "RACE_ETHNICITY", 
                              "REL_FAMILY_ID", "ACS_RAKED_PROPENSITY_SCORE"))) %>%
  mutate(RACE_ETHNICITY = recode_factor(as.factor(RACE_ETHNICITY),
                `1` = "White", `2` = "Black", `3` = "Hispanic", `4` = "Asian", `5` = "Other",
                .default = "White")) %>%
  mutate(SEX = as.factor(SEX)) %>%
  mutate(REL_FAMILY_ID = as.factor(REL_FAMILY_ID))

ACSselected %>%
 filter(EVENTNAME =="baseline_year_1_arm_1") %>%
 skimr::skim()
Data summary
Name Piped data
Number of rows 11876
Number of columns 7
_______________________
Column type frequency:
character 2
factor 3
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
SUBJECTKEY 0 1 12 16 0 11876 0
EVENTNAME 0 1 21 21 0 1 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
SEX 0 1 FALSE 2 M: 6196, F: 5680
RACE_ETHNICITY 2 1 FALSE 5 Whi: 6180, His: 2411, Bla: 1784, Oth: 1247
REL_FAMILY_ID 0 1 FALSE 9854 373: 5, 749: 4, 11: 3, 400: 3

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
INTERVIEW_AGE 0 1 118.98 7.50 107.00 112.00 119.00 126.00 133.00 ▇▆▆▆▆
ACS_RAKED_PROPENSITY_SCORE 0 1 691.34 350.98 161.36 449.35 619.31 821.74 1778.92 ▅▇▂▂▁

2.2 site information

###loading site and scanner information
Siteinfo <-tibble::as_tibble(read.csv(paste0(dataFold, "ABCD_LT01_DATA_TABLE.csv")))

Change the wrong site manually based on the: “Release Notes: Adolescent Brain Cognitive Development Study ℠ (ABCD Study ® ) Data Release 4.0 Changes and Known Issues”

only fixed the baseline and two year followup that is used in the analysis

As ID is required not to show in the coding shared online, please refer to this file for the exact subject IDs.

Siteinfo_fixed <- Siteinfo

site_fix <- readRDS(paste0(scriptfold,'Common_psy_gene_brain_all/saved_outputs/site_fix', '.RData'))

for(i in 1:dim(site_fix)[1]){
  fix_site_id  <- site_fix$SUBJECTKEY[i]
  fix_site_event <- site_fix$EVENTNAME[i]
  fix_site <- site_fix$SITE_ID_L[i]
  Siteinfo_fixed$SITE_ID_L[which(Siteinfo_fixed$SUBJECTKEY== fix_site_id& Siteinfo_fixed$EVENTNAME == fix_site_event)] <- fix_site
}

Siteinfo <-Siteinfo_fixed 

2.3 Load vision indicator to find subjects with problem in vision

vision_idx <- as_tibble(read.csv(paste0(dataFold,"ABCD_SVS01_DATA_TABLE.CSV"))) %>% 
  mutate(visionProb = ifelse(SNELLEN_VA_Y == 0 | SNELLEN_VA_Y == 1 | VIS_FLG == 2, 1, 0))

#vision_idx %>% select(SNELLEN_VA_Y, VIS_FLG, visionProb) %>%  arrange(SNELLEN_VA_Y)

2.4 feature names for plotting

plotting_names <- readxl::read_excel(paste0(scriptfold,"Common_psy_gene_brain_all/PsychopathologyPlottingNames_NP.xlsx")) %>%dplyr::select(-"dictionary_name")

plotting_names <- plotting_names %>%
                  rename(feature_names=variable_name)

2.5 mental health

2.5.1 Parent’s report parent’s mental health

Parent Adult Self Report Raw Scores

Adaptive Functioning and Strengths Scales: Education; Friends; Spouse/Partner; Family; Job; Personal Strengths

Syndrome Scales: Anxious/Depressed; Withdrawn; Somatic Complaints; Thought Problems; Attention Problems; Aggressive Behavior; Rule-breaking Behavior; and Intrusive

DSM-5-oriented Scales: Depressive Problems; Anxiety Problems; Somatic Problems; Avoidant Personality Problems; Attention Deficit/Hyperactivity Problems (Inattention and Hyperactivity/Impulsivity subscales); and Antisocial Personality Problems

Substance Use Scales: Tobacco; Alcohol; Drugs

Internalizing (INT), Attention Problems (ATT), Externalizing (EXT), and Total Problems (TOT) scales.

ASRPrecomputed <-read_csv(paste0(dataFold,"ABCD_ASRS01_DATA_TABLE.csv")) 
#glimpse(ASRPrecomputed)
ASRPrecomputed_names<- ASRPrecomputed %>% dplyr::select(ends_with("_R") )%>% colnames()
ASRPrecomputedSelected <- ASRPrecomputed %>% dplyr::select(all_of(c("SUBJECTKEY",
                                                                    "EVENTNAME", ASRPrecomputed_names))) 

ASRPrecomputedSelected %>%
  filter(EVENTNAME =="baseline_year_1_arm_1") %>%
  skimr::skim()
Data summary
Name Piped data
Number of rows 11876
Number of columns 22
_______________________
Column type frequency:
character 2
numeric 20
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
SUBJECTKEY 0 1 12 16 0 11876 0
EVENTNAME 0 1 21 21 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ASR_SCR_PERSTR_R 5 1 16.52 3.52 0 15 17 19 22 ▁▁▂▇▇
ASR_SCR_ANXDEP_R 5 1 5.03 4.93 0 1 4 7 35 ▇▂▁▁▁
ASR_SCR_WITHDRAWN_R 5 1 1.56 2.11 0 0 1 2 17 ▇▁▁▁▁
ASR_SCR_SOMATIC_R 5 1 2.94 3.18 0 1 2 4 23 ▇▂▁▁▁
ASR_SCR_THOUGHT_R 5 1 1.43 1.85 0 0 1 2 18 ▇▁▁▁▁
ASR_SCR_ATTENTION_R 5 1 4.63 4.26 0 1 4 7 27 ▇▃▁▁▁
ASR_SCR_AGGRESSIVE_R 5 1 3.35 3.57 0 1 2 5 25 ▇▂▁▁▁
ASR_SCR_RULEBREAK_R 5 1 1.19 1.85 0 0 0 2 18 ▇▁▁▁▁
ASR_SCR_INTRUSIVE_R 5 1 1.00 1.43 0 0 0 2 10 ▇▁▁▁▁
ASR_SCR_INTERNAL_R 5 1 9.53 8.74 0 3 7 13 70 ▇▂▁▁▁
ASR_SCR_EXTERNAL_R 5 1 5.54 5.62 0 1 4 8 39 ▇▂▁▁▁
ASR_SCR_TOTPROB_R 4 1 21.14 17.96 0 8 16 29 154 ▇▂▁▁▁
ASR_SCR_DEPRESS_R 5 1 3.96 3.66 0 1 3 6 28 ▇▂▁▁▁
ASR_SCR_ANXDISORD_R 5 1 3.75 2.56 0 2 3 5 12 ▇▆▆▂▁
ASR_SCR_SOMATICPR_R 5 1 1.91 2.37 0 0 1 3 18 ▇▂▁▁▁
ASR_SCR_AVOIDANT_R 5 1 2.01 2.15 0 0 1 3 14 ▇▃▁▁▁
ASR_SCR_ADHD_R 5 1 3.80 3.64 0 1 3 6 25 ▇▂▁▁▁
ASR_SCR_ANTISOCIAL_R 5 1 2.10 2.49 0 0 1 3 22 ▇▁▁▁▁
ASR_SCR_INATTENTION_R 5 1 2.35 2.35 0 0 2 4 14 ▇▃▁▁▁
ASR_SCR_HYPERACTIVE_R 5 1 1.45 1.73 0 0 1 2 12 ▇▂▁▁▁
ASRPrecomputedSelected %>%
  filter(EVENTNAME =="2_year_follow_up_y_arm_1") %>%
  skimr::skim()
Data summary
Name Piped data
Number of rows 10414
Number of columns 22
_______________________
Column type frequency:
character 2
numeric 20
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
SUBJECTKEY 0 1 12 16 0 10414 0
EVENTNAME 0 1 24 24 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ASR_SCR_PERSTR_R 57 0.99 16.10 3.70 0 14 17 19 22 ▁▁▃▇▇
ASR_SCR_ANXDEP_R 57 0.99 5.04 4.96 0 1 4 7 34 ▇▂▁▁▁
ASR_SCR_WITHDRAWN_R 57 0.99 1.57 2.07 0 0 1 2 18 ▇▁▁▁▁
ASR_SCR_SOMATIC_R 57 0.99 2.95 3.17 0 1 2 4 22 ▇▂▁▁▁
ASR_SCR_THOUGHT_R 57 0.99 1.28 1.71 0 0 1 2 17 ▇▁▁▁▁
ASR_SCR_ATTENTION_R 57 0.99 4.59 4.22 0 1 3 7 29 ▇▃▁▁▁
ASR_SCR_AGGRESSIVE_R 57 0.99 3.13 3.38 0 1 2 5 26 ▇▂▁▁▁
ASR_SCR_RULEBREAK_R 57 0.99 1.06 1.73 0 0 0 2 21 ▇▁▁▁▁
ASR_SCR_INTRUSIVE_R 57 0.99 0.95 1.39 0 0 0 1 11 ▇▁▁▁▁
ASR_SCR_INTERNAL_R 57 0.99 9.55 8.73 0 3 7 13 65 ▇▂▁▁▁
ASR_SCR_EXTERNAL_R 57 0.99 5.15 5.30 0 1 4 7 57 ▇▁▁▁▁
ASR_SCR_TOTPROB_R 57 0.99 20.57 17.48 0 8 16 28 133 ▇▂▁▁▁
ASR_SCR_DEPRESS_R 57 0.99 4.00 3.67 0 1 3 6 25 ▇▂▁▁▁
ASR_SCR_ANXDISORD_R 57 0.99 3.64 2.53 0 2 3 5 12 ▇▆▆▁▁
ASR_SCR_SOMATICPR_R 57 0.99 1.92 2.36 0 0 1 3 17 ▇▂▁▁▁
ASR_SCR_AVOIDANT_R 57 0.99 1.98 2.11 0 0 1 3 13 ▇▃▁▁▁
ASR_SCR_ADHD_R 57 0.99 3.68 3.54 0 1 3 5 23 ▇▃▁▁▁
ASR_SCR_ANTISOCIAL_R 57 0.99 2.00 2.35 0 0 1 3 30 ▇▁▁▁▁
ASR_SCR_INATTENTION_R 57 0.99 2.35 2.33 0 0 2 4 14 ▇▃▁▁▁
ASR_SCR_HYPERACTIVE_R 57 0.99 1.33 1.66 0 0 1 2 12 ▇▁▁▁▁
ASRPrecomputedSelected %>%
  filter(EVENTNAME =="3_year_follow_up_y_arm_1") %>%
  skimr::skim()
Data summary
Name Piped data
Number of rows 6251
Number of columns 22
_______________________
Column type frequency:
character 2
numeric 20
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
SUBJECTKEY 0 1 12 16 0 6251 0
EVENTNAME 0 1 24 24 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ASR_SCR_PERSTR_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_ANXDEP_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_WITHDRAWN_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_SOMATIC_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_THOUGHT_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_ATTENTION_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_AGGRESSIVE_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_RULEBREAK_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_INTRUSIVE_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_INTERNAL_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_EXTERNAL_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_TOTPROB_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_DEPRESS_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_ANXDISORD_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_SOMATICPR_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_AVOIDANT_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_ADHD_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_ANTISOCIAL_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_INATTENTION_R 6251 0 NaN NA NA NA NA NA NA
ASR_SCR_HYPERACTIVE_R 6251 0 NaN NA NA NA NA NA NA

2.6 Personality

2.6.1 Load BIS/BAS

BISBAS <-as_tibble(read.csv(paste0(dataFold,"ABCD_BISBAS01_DATA_TABLE.csv"))) %>% 
# filter(EVENTNAME =="baseline_year_1_arm_1")   %>%
  mutate(BISAvg=rowMeans(cbind(BISBAS2_Y,BISBAS3_Y,BISBAS4_Y,BISBAS6_Y), na.rm=F)) %>%
  mutate(BASRRAvg=rowMeans(cbind(BISBAS8_Y,BISBAS10_Y,BISBAS11_Y,BISBAS12_Y), na.rm=F)) %>%
  mutate(BASDriveAvg=rowMeans(cbind(BISBAS13_Y,BISBAS14_Y,BISBAS15_Y,BISBAS16_Y), na.rm=F)) %>%
  mutate(BASFunAvg=rowMeans(cbind(BISBAS17_Y,BISBAS18_Y,BISBAS19_Y,BISBAS20_Y), na.rm=F)) %>% 
  mutate(BASAllAvg=rowMeans(cbind(BASRRAvg,BASDriveAvg,BASFunAvg), na.rm=F)) %>%
  dplyr::select(all_of(c('SUBJECTKEY','EVENTNAME','BISAvg','BASRRAvg','BASDriveAvg','BASFunAvg','BASAllAvg')))

2.6.2 histrogram of BIS/BAS

BISBAS %>% filter(EVENTNAME =="baseline_year_1_arm_1")   %>%
ggplot(aes(x=BISAvg), bins = 100) + 
  geom_histogram(color="black", fill="white") +
  xlab("averaged BIS") +
  theme_classic(base_size = 30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (`stat_bin()`).

BISBAS %>% filter(EVENTNAME =="2_year_follow_up_y_arm_1" )   %>%
ggplot(aes(x=BISAvg), bins = 100) + 
  geom_histogram(color="black", fill="white") +
  xlab("averaged BIS") +
  theme_classic(base_size = 30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 30 rows containing non-finite values (`stat_bin()`).

BISBAS %>% filter(EVENTNAME =="baseline_year_1_arm_1")   %>%
ggplot(aes(x=BASAllAvg)) + 
  geom_histogram(color="black", fill="white", bins = 100) +
  xlab("averaged BAS") +
  theme_classic(base_size = 30)
## Warning: Removed 23 rows containing non-finite values (`stat_bin()`).

BISBAS %>% filter(EVENTNAME =="2_year_follow_up_y_arm_1")   %>%
ggplot(aes(x=BASAllAvg)) + 
  geom_histogram(color="black", fill="white", bins = 100) +
  xlab("averaged BAS") +
  theme_classic(base_size = 30)
## Warning: Removed 30 rows containing non-finite values (`stat_bin()`).

2.6.3 other mental health surveys from youth

look like only ABCD Prodromal Psychosis Scale (PPS) and UPSS (beside BIS/BAS) are avaliable for the baseline.
Though 4595 data from PPS is missing. So we will not use PPS and only focus on UPPS.

#ABCD Sum Scores Mental Health Youth
mentalHealthYouth <-as_tibble(read.csv(paste0(dataFold,"ABCD_MHY02_DATA_TABLE.csv"))) 

mentalHealthYouth %>% filter(EVENTNAME =="baseline_year_1_arm_1") %>% dplyr::select(-1:-8)  %>%
 skimr::skim()
Data summary
Name Piped data
Number of rows 11876
Number of columns 100
_______________________
Column type frequency:
logical 13
numeric 87
________________________
Group variables None

Variable type: logical

skim_variable n_missing complete_rate mean count
PLE_Y_SS_TOTAL_GOOD_NT 11876 0 NaN :
PLE_Y_SS_TOTAL_BAD_NT 11876 0 NaN :
PLE_Y_SS_AFFECT_SUM_NT 11876 0 NaN :
PLE_Y_SS_AFFECTED_MEAN_NT 11876 0 NaN :
PLE_Y_SS_AFFECTED_GOOD_SUM_NT 11876 0 NaN :
PLE_Y_SS_AFFECTED_GOOD_MEAN_NT 11876 0 NaN :
PLE_Y_SS_AFFECTED_BAD_SUM_NT 11876 0 NaN :
PLE_Y_SS_AFFECTED_BAD_MEAN_NT 11876 0 NaN :
DELQ_Y_SS_SUM 11876 0 NaN :
DELQ_Y_SS_SUM_NM 11876 0 NaN :
DELQ_Y_SS_SUM_NT 11876 0 NaN :
ERQ_SS_REAPPRAISAL_PR 11876 0 NaN :
ERQ_SS_SUPPRESS_PR 11876 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PLE_Y_SS_TOTAL_NUMBER 11876 0.00 NaN NA NA NA NA NA NA
PLE_Y_SS_TOTAL_NUMBER_NM 11876 0.00 NaN NA NA NA NA NA NA
PLE_Y_SS_TOTAL_NUMBER_NT 11876 0.00 NaN NA NA NA NA NA NA
PLE_Y_SS_TOTAL_GOOD 0 1.00 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
PLE_Y_SS_TOTAL_BAD 0 1.00 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
PLE_Y_SS_AFFECT_SUM 11876 0.00 NaN NA NA NA NA NA NA
PLE_Y_SS_AFFECTED_GOOD_SUM 0 1.00 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
PLE_Y_SS_AFFECTED_GOOD_MEAN 11876 0.00 NaN NA NA NA NA NA NA
PLE_Y_SS_AFFECTED_BAD_SUM 0 1.00 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
PLE_Y_SS_AFFECTED_BAD_MEAN 11876 0.00 NaN NA NA NA NA NA NA
PPS_Y_SS_NUMBER 11 1.00 2.63 3.56 0 0 1 4 21 ▇▂▁▁▁
PPS_Y_SS_NUMBER_NM 0 1.00 0.02 0.69 0 0 0 0 21 ▇▁▁▁▁
PPS_Y_SS_NUMBER_NT 0 1.00 21.00 0.00 21 21 21 21 21 ▁▁▇▁▁
PPS_Y_SS_BOTHER_SUM 4594 0.61 2.14 2.62 0 0 1 3 19 ▇▂▁▁▁
PPS_Y_SS_BOTHER_SUM_NM 0 1.00 18.37 3.56 0 17 20 21 21 ▁▁▁▂▇
PPS_Y_SS_BOTHER_SUM_NT 0 1.00 21.00 0.00 21 21 21 21 21 ▁▁▇▁▁
PPS_Y_SS_BOTHER_N_1 4594 0.61 2.15 2.07 0 1 2 3 17 ▇▂▁▁▁
PPS_Y_SS_BOTHER_N_1_NM 0 1.00 18.37 3.56 0 17 20 21 21 ▁▁▁▂▇
PPS_Y_SS_BOTHER_N_1_NT 0 1.00 21.00 0.00 21 21 21 21 21 ▁▁▇▁▁
PPS_Y_SS_SEVERITY_SCORE 10 1.00 6.32 10.61 0 0 2 8 104 ▇▁▁▁▁
PPS_Y_SS_SEVERITY_SCORE_NM 0 1.00 38.05 5.67 3 36 40 42 42 ▁▁▁▁▇
PPS_Y_SS_SEVERITY_SCORE_NT 0 1.00 42.00 0.00 42 42 42 42 42 ▁▁▇▁▁
PPS_SS_MEAN_SEVERITY 4595 0.61 2.15 1.10 1 1 2 3 6 ▇▃▂▁▁
UPPS_Y_SS_NEGATIVE_URGENCY 23 1.00 8.49 2.65 4 7 8 10 16 ▆▇▇▂▁
UPPS_Y_SS_NEGATIVE_URGENCY_NM 0 1.00 0.01 0.17 0 0 0 0 4 ▇▁▁▁▁
UPPS_Y_SS_NEGATIVE_URGENCY_NT 0 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
UPPS_Y_SS_LACK_OF_PLANNING 23 1.00 7.74 2.38 4 6 8 9 16 ▆▇▅▁▁
UPPS_Y_SS_LACK_OF_PLANNING_NM 0 1.00 0.01 0.18 0 0 0 0 4 ▇▁▁▁▁
UPPS_Y_SS_LACK_OF_PLANNING_NT 0 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
UPPS_Y_SS_SENSATION_SEEKING 23 1.00 9.77 2.68 4 8 10 12 16 ▂▅▇▃▂
UPPS_Y_SS_SENSATION_SEEKING_NM 0 1.00 0.01 0.18 0 0 0 0 4 ▇▁▁▁▁
UPPS_Y_SS_SENSATION_SEEKING_NT 0 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
UPPS_Y_SS_POSITIVE_URGENCY 23 1.00 7.99 2.96 4 6 8 10 16 ▇▆▆▂▁
UPPS_Y_SS_POSITIVE_URGENCY_NM 0 1.00 0.01 0.18 0 0 0 0 4 ▇▁▁▁▁
UPPS_Y_SS_POSITIVE_URGENCY_NT 0 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
UPPS_Y_SS_LACK_OF_PERSEVERANCE 23 1.00 7.04 2.25 4 5 7 8 16 ▇▆▃▁▁
UPPS_Y_SS_LACK_OF_PERS_NM 0 1.00 0.01 0.18 0 0 0 0 4 ▇▁▁▁▁
UPPS_Y_SS_LACK_OF_PERS_NT 0 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
BIS_Y_SS_BIS_SUM 22 1.00 9.51 3.75 0 7 9 12 21 ▂▇▇▃▁
BIS_Y_SS_BIS_SUM_NM 22 1.00 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
BIS_Y_SS_BIS_SUM_NT 22 1.00 7.00 0.00 7 7 7 7 7 ▁▁▇▁▁
BIS_Y_SS_BAS_RR 23 1.00 11.00 2.92 0 9 11 13 15 ▁▂▅▇▇
BIS_Y_SS_BAS_RR_NM 23 1.00 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
BIS_Y_SS_BAS_RR_NT 23 1.00 5.00 0.00 5 5 5 5 5 ▁▁▇▁▁
BIS_Y_SS_BAS_DRIVE 23 1.00 4.14 3.06 0 2 4 6 12 ▇▆▅▂▂
BIS_Y_SS_BAS_DRIVE_NM 23 1.00 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
BIS_Y_SS_BAS_DRIVE_NT 23 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
BIS_Y_SS_BAS_FS 23 1.00 5.71 2.64 0 4 6 7 12 ▂▅▇▃▂
BIS_Y_SS_BAS_FS_NM 23 1.00 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
BIS_Y_SS_BAS_FS_NT 23 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
BIS_Y_SS_BISM_SUM 22 1.00 5.53 2.84 0 3 5 8 12 ▃▅▇▃▂
BIS_Y_SS_BISM_SUM_NM 22 1.00 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
BIS_Y_SS_BISM_SUM_NT 22 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
BIS_Y_SS_BASM_RR 23 1.00 8.82 2.39 0 7 9 11 12 ▁▁▅▆▇
BIS_Y_SS_BASM_RR_NM 23 1.00 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
BIS_Y_SS_BASM_RR_NT 23 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
BIS_Y_SS_BASM_DRIVE 23 1.00 4.14 3.06 0 2 4 6 12 ▇▆▅▂▂
BIS_Y_SS_BASM_DRIVE_NM 0 1.00 0.01 0.18 0 0 0 0 4 ▇▁▁▁▁
BIS_Y_SS_BASM_DRIVE_NT 0 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
SUP_Y_SS_SUM 11876 0.00 NaN NA NA NA NA NA NA
SUP_Y_SS_SUM_NM 1494 0.87 7.00 0.00 7 7 7 7 7 ▁▁▇▁▁
SUP_Y_SS_SUM_NT 1494 0.87 7.00 0.00 7 7 7 7 7 ▁▁▇▁▁
GISH_Y_SS_M_SUM 11876 0.00 NaN NA NA NA NA NA NA
GISH_Y_SS_M_SUM_NM 0 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
GISH_Y_SS_M_SUM_NT 0 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
GISH_Y_SS_F_SUM 11876 0.00 NaN NA NA NA NA NA NA
GISH_Y_SS_F_SUM_NM 0 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
GISH_Y_SS_F_SUM_NT 0 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
PEQ_SS_RELATIONAL_AGGS_NM 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_RELATIONAL_AGGS_NT 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_RELATIONAL_VICTIM 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_RELATIONAL_VICTIM_NM 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_RELATIONAL_VICTIM_NT 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_REPUTATION_AGGS 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_REPUTATION_AGGS_NM 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_REPUTATION_AGGS_NT 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_REPUTATION_VICTIM 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_REPUTATION_VICTIM_NM 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_REPUTATION_VICTIM_NT 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_OVERT_AGGRESSION 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_OVERT_AGGRESSION_NM 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_OVERT_AGGRESSION_NT 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_OVERT_VICTIM 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_OVERT_VICTIM_NM 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_OVERT_VICTIM_NT 11876 0.00 NaN NA NA NA NA NA NA
PEQ_SS_RELATIONAL_AGGS 11876 0.00 NaN NA NA NA NA NA NA
PSTR_SS_PR 11876 0.00 NaN NA NA NA NA NA NA
mentalHealthYouth %>% filter(EVENTNAME =="2_year_follow_up_y_arm_1") %>% dplyr::select(-1:-8)  %>%
 skimr::skim()
Data summary
Name Piped data
Number of rows 10414
Number of columns 100
_______________________
Column type frequency:
logical 13
numeric 87
________________________
Group variables None

Variable type: logical

skim_variable n_missing complete_rate mean count
PLE_Y_SS_TOTAL_GOOD_NT 10414 0 NaN :
PLE_Y_SS_TOTAL_BAD_NT 10414 0 NaN :
PLE_Y_SS_AFFECT_SUM_NT 10414 0 NaN :
PLE_Y_SS_AFFECTED_MEAN_NT 10414 0 NaN :
PLE_Y_SS_AFFECTED_GOOD_SUM_NT 10414 0 NaN :
PLE_Y_SS_AFFECTED_GOOD_MEAN_NT 10414 0 NaN :
PLE_Y_SS_AFFECTED_BAD_SUM_NT 10414 0 NaN :
PLE_Y_SS_AFFECTED_BAD_MEAN_NT 10414 0 NaN :
DELQ_Y_SS_SUM 10414 0 NaN :
DELQ_Y_SS_SUM_NM 10414 0 NaN :
DELQ_Y_SS_SUM_NT 10414 0 NaN :
ERQ_SS_REAPPRAISAL_PR 10414 0 NaN :
ERQ_SS_SUPPRESS_PR 10414 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PLE_Y_SS_TOTAL_NUMBER 22 1.00 5.26 3.25 0 3.00 5.00 7.0 23 ▇▇▂▁▁
PLE_Y_SS_TOTAL_NUMBER_NM 0 1.00 0.05 1.15 0 0.00 0.00 0.0 25 ▇▁▁▁▁
PLE_Y_SS_TOTAL_NUMBER_NT 0 1.00 25.00 0.00 25 25.00 25.00 25.0 25 ▁▁▇▁▁
PLE_Y_SS_TOTAL_GOOD 0 1.00 1.55 1.33 0 0.00 1.00 2.0 8 ▇▆▁▁▁
PLE_Y_SS_TOTAL_BAD 0 1.00 2.39 2.25 0 1.00 2.00 3.0 19 ▇▂▁▁▁
PLE_Y_SS_AFFECT_SUM 268 0.97 8.77 6.90 0 4.00 7.00 12.0 61 ▇▂▁▁▁
PLE_Y_SS_AFFECTED_GOOD_SUM 0 1.00 2.43 2.85 0 0.00 2.00 4.0 18 ▇▂▁▁▁
PLE_Y_SS_AFFECTED_GOOD_MEAN 2668 0.74 1.57 1.08 0 0.67 1.67 2.5 3 ▇▅▃▅▇
PLE_Y_SS_AFFECTED_BAD_SUM 0 1.00 4.84 5.35 0 1.00 3.00 7.0 51 ▇▁▁▁▁
PLE_Y_SS_AFFECTED_BAD_MEAN 1902 0.82 1.94 0.77 0 1.50 2.00 2.5 3 ▁▅▃▇▇
PPS_Y_SS_NUMBER 26 1.00 1.56 2.79 0 0.00 0.00 2.0 21 ▇▁▁▁▁
PPS_Y_SS_NUMBER_NM 0 1.00 0.05 1.05 0 0.00 0.00 0.0 21 ▇▁▁▁▁
PPS_Y_SS_NUMBER_NT 0 1.00 21.00 0.00 21 21.00 21.00 21.0 21 ▁▁▇▁▁
PPS_Y_SS_BOTHER_SUM 5929 0.43 1.68 2.19 0 0.00 1.00 2.0 17 ▇▁▁▁▁
PPS_Y_SS_BOTHER_SUM_NM 0 1.00 19.44 2.79 0 19.00 21.00 21.0 21 ▁▁▁▁▇
PPS_Y_SS_BOTHER_SUM_NT 0 1.00 21.00 0.00 21 21.00 21.00 21.0 21 ▁▁▇▁▁
PPS_Y_SS_BOTHER_N_1 5929 0.43 1.94 2.00 0 1.00 1.00 3.0 18 ▇▂▁▁▁
PPS_Y_SS_BOTHER_N_1_NM 0 1.00 19.44 2.79 0 19.00 21.00 21.0 21 ▁▁▁▁▇
PPS_Y_SS_BOTHER_N_1_NT 0 1.00 21.00 0.00 21 21.00 21.00 21.0 21 ▁▁▇▁▁
PPS_Y_SS_SEVERITY_SCORE 26 1.00 3.56 7.72 0 0.00 0.00 4.0 89 ▇▁▁▁▁
PPS_Y_SS_SEVERITY_SCORE_NM 0 1.00 39.72 4.29 5 39.00 42.00 42.0 42 ▁▁▁▁▇
PPS_Y_SS_SEVERITY_SCORE_NT 0 1.00 42.00 0.00 42 42.00 42.00 42.0 42 ▁▁▇▁▁
PPS_SS_MEAN_SEVERITY 5929 0.43 2.06 1.09 0 1.00 2.00 3.0 6 ▇▇▆▂▁
UPPS_Y_SS_NEGATIVE_URGENCY 30 1.00 7.76 2.34 4 6.00 8.00 9.0 16 ▇▇▇▁▁
UPPS_Y_SS_NEGATIVE_URGENCY_NM 0 1.00 0.01 0.21 0 0.00 0.00 0.0 4 ▇▁▁▁▁
UPPS_Y_SS_NEGATIVE_URGENCY_NT 0 1.00 4.00 0.00 4 4.00 4.00 4.0 4 ▁▁▇▁▁
UPPS_Y_SS_LACK_OF_PLANNING 30 1.00 7.77 2.24 4 6.00 8.00 9.0 16 ▅▇▃▁▁
UPPS_Y_SS_LACK_OF_PLANNING_NM 0 1.00 0.01 0.21 0 0.00 0.00 0.0 4 ▇▁▁▁▁
UPPS_Y_SS_LACK_OF_PLANNING_NT 0 1.00 4.00 0.00 4 4.00 4.00 4.0 4 ▁▁▇▁▁
UPPS_Y_SS_SENSATION_SEEKING 29 1.00 9.48 2.68 3 8.00 9.00 11.0 16 ▂▇▇▇▂
UPPS_Y_SS_SENSATION_SEEKING_NM 0 1.00 0.01 0.21 0 0.00 0.00 0.0 4 ▇▁▁▁▁
UPPS_Y_SS_SENSATION_SEEKING_NT 0 1.00 4.00 0.00 4 4.00 4.00 4.0 4 ▁▁▇▁▁
UPPS_Y_SS_POSITIVE_URGENCY 30 1.00 7.39 2.67 4 5.00 8.00 9.0 16 ▇▆▅▁▁
UPPS_Y_SS_POSITIVE_URGENCY_NM 0 1.00 0.01 0.21 0 0.00 0.00 0.0 4 ▇▁▁▁▁
UPPS_Y_SS_POSITIVE_URGENCY_NT 0 1.00 4.00 0.00 4 4.00 4.00 4.0 4 ▁▁▇▁▁
UPPS_Y_SS_LACK_OF_PERSEVERANCE 30 1.00 6.97 2.25 4 5.00 7.00 8.0 16 ▇▇▃▁▁
UPPS_Y_SS_LACK_OF_PERS_NM 0 1.00 0.01 0.21 0 0.00 0.00 0.0 4 ▇▁▁▁▁
UPPS_Y_SS_LACK_OF_PERS_NT 0 1.00 4.00 0.00 4 4.00 4.00 4.0 4 ▁▁▇▁▁
BIS_Y_SS_BIS_SUM 30 1.00 8.70 3.94 0 6.00 8.00 11.0 21 ▃▇▆▃▁
BIS_Y_SS_BIS_SUM_NM 30 1.00 0.00 0.00 0 0.00 0.00 0.0 0 ▁▁▇▁▁
BIS_Y_SS_BIS_SUM_NT 30 1.00 7.00 0.00 7 7.00 7.00 7.0 7 ▁▁▇▁▁
BIS_Y_SS_BAS_RR 30 1.00 9.92 3.11 0 8.00 10.00 12.0 15 ▁▃▇▇▆
BIS_Y_SS_BAS_RR_NM 30 1.00 0.00 0.00 0 0.00 0.00 0.0 0 ▁▁▇▁▁
BIS_Y_SS_BAS_RR_NT 30 1.00 5.00 0.00 5 5.00 5.00 5.0 5 ▁▁▇▁▁
BIS_Y_SS_BAS_DRIVE 30 1.00 3.68 2.79 0 2.00 3.00 5.0 12 ▇▆▅▁▁
BIS_Y_SS_BAS_DRIVE_NM 30 1.00 0.00 0.00 0 0.00 0.00 0.0 0 ▁▁▇▁▁
BIS_Y_SS_BAS_DRIVE_NT 30 1.00 4.00 0.00 4 4.00 4.00 4.0 4 ▁▁▇▁▁
BIS_Y_SS_BAS_FS 30 1.00 4.55 2.59 0 3.00 4.00 6.0 12 ▆▇▇▂▁
BIS_Y_SS_BAS_FS_NM 30 1.00 0.00 0.00 0 0.00 0.00 0.0 0 ▁▁▇▁▁
BIS_Y_SS_BAS_FS_NT 30 1.00 4.00 0.00 4 4.00 4.00 4.0 4 ▁▁▇▁▁
BIS_Y_SS_BISM_SUM 30 1.00 4.96 2.82 0 3.00 5.00 7.0 12 ▅▆▇▃▂
BIS_Y_SS_BISM_SUM_NM 30 1.00 0.00 0.00 0 0.00 0.00 0.0 0 ▁▁▇▁▁
BIS_Y_SS_BISM_SUM_NT 30 1.00 4.00 0.00 4 4.00 4.00 4.0 4 ▁▁▇▁▁
BIS_Y_SS_BASM_RR 30 1.00 7.96 2.51 0 6.00 8.00 10.0 12 ▁▂▇▇▇
BIS_Y_SS_BASM_RR_NM 30 1.00 0.00 0.00 0 0.00 0.00 0.0 0 ▁▁▇▁▁
BIS_Y_SS_BASM_RR_NT 30 1.00 4.00 0.00 4 4.00 4.00 4.0 4 ▁▁▇▁▁
BIS_Y_SS_BASM_DRIVE 30 1.00 3.68 2.79 0 2.00 3.00 5.0 12 ▇▆▅▁▁
BIS_Y_SS_BASM_DRIVE_NM 0 1.00 0.01 0.21 0 0.00 0.00 0.0 4 ▇▁▁▁▁
BIS_Y_SS_BASM_DRIVE_NT 0 1.00 4.00 0.00 4 4.00 4.00 4.0 4 ▁▁▇▁▁
SUP_Y_SS_SUM 10414 0.00 NaN NA NA NA NA NA NA
SUP_Y_SS_SUM_NM 10414 0.00 NaN NA NA NA NA NA NA
SUP_Y_SS_SUM_NT 10414 0.00 NaN NA NA NA NA NA NA
GISH_Y_SS_M_SUM 5066 0.51 19.63 1.06 4 20.00 20.00 20.0 20 ▁▁▁▁▇
GISH_Y_SS_M_SUM_NM 2844 0.73 1.15 1.80 0 0.00 0.00 4.0 4 ▇▁▁▁▃
GISH_Y_SS_M_SUM_NT 2844 0.73 4.00 0.00 4 4.00 4.00 4.0 4 ▁▁▇▁▁
GISH_Y_SS_F_SUM 5551 0.47 18.67 2.31 4 18.00 20.00 20.0 20 ▁▁▁▁▇
GISH_Y_SS_F_SUM_NM 3157 0.70 1.29 1.86 0 0.00 0.00 4.0 4 ▇▁▁▁▃
GISH_Y_SS_F_SUM_NT 3157 0.70 4.00 0.00 4 4.00 4.00 4.0 4 ▁▁▇▁▁
PEQ_SS_RELATIONAL_AGGS_NM 0 1.00 0.01 0.14 0 0.00 0.00 0.0 3 ▇▁▁▁▁
PEQ_SS_RELATIONAL_AGGS_NT 0 1.00 3.00 0.00 3 3.00 3.00 3.0 3 ▁▁▇▁▁
PEQ_SS_RELATIONAL_VICTIM 22 1.00 4.74 1.93 3 3.00 4.00 6.0 15 ▇▂▁▁▁
PEQ_SS_RELATIONAL_VICTIM_NM 0 1.00 0.01 0.14 0 0.00 0.00 0.0 3 ▇▁▁▁▁
PEQ_SS_RELATIONAL_VICTIM_NT 0 1.00 3.00 0.00 3 3.00 3.00 3.0 3 ▁▁▇▁▁
PEQ_SS_REPUTATION_AGGS 22 1.00 3.17 0.67 3 3.00 3.00 3.0 15 ▇▁▁▁▁
PEQ_SS_REPUTATION_AGGS_NM 0 1.00 0.01 0.14 0 0.00 0.00 0.0 3 ▇▁▁▁▁
PEQ_SS_REPUTATION_AGGS_NT 0 1.00 3.00 0.00 3 3.00 3.00 3.0 3 ▁▁▇▁▁
PEQ_SS_REPUTATION_VICTIM 22 1.00 3.97 1.80 3 3.00 3.00 4.0 15 ▇▁▁▁▁
PEQ_SS_REPUTATION_VICTIM_NM 0 1.00 0.01 0.14 0 0.00 0.00 0.0 3 ▇▁▁▁▁
PEQ_SS_REPUTATION_VICTIM_NT 0 1.00 3.00 0.00 3 3.00 3.00 3.0 3 ▁▁▇▁▁
PEQ_SS_OVERT_AGGRESSION 22 1.00 3.31 0.93 3 3.00 3.00 3.0 15 ▇▁▁▁▁
PEQ_SS_OVERT_AGGRESSION_NM 0 1.00 0.01 0.14 0 0.00 0.00 0.0 3 ▇▁▁▁▁
PEQ_SS_OVERT_AGGRESSION_NT 0 1.00 3.00 0.00 3 3.00 3.00 3.0 3 ▁▁▇▁▁
PEQ_SS_OVERT_VICTIM 22 1.00 3.67 1.38 3 3.00 3.00 4.0 15 ▇▁▁▁▁
PEQ_SS_OVERT_VICTIM_NM 0 1.00 0.01 0.14 0 0.00 0.00 0.0 3 ▇▁▁▁▁
PEQ_SS_OVERT_VICTIM_NT 0 1.00 3.00 0.00 3 3.00 3.00 3.0 3 ▁▁▇▁▁
PEQ_SS_RELATIONAL_AGGS 22 1.00 3.87 1.27 3 3.00 3.00 5.0 14 ▇▁▁▁▁
PSTR_SS_PR 10414 0.00 NaN NA NA NA NA NA NA
UPPS <- mentalHealthYouth %>% dplyr::select(SUBJECTKEY,EVENTNAME, 
                             UPPS_Y_SS_NEGATIVE_URGENCY, UPPS_Y_SS_LACK_OF_PLANNING,
                             UPPS_Y_SS_SENSATION_SEEKING, UPPS_Y_SS_POSITIVE_URGENCY, UPPS_Y_SS_LACK_OF_PERSEVERANCE)

2.7 loading cbcl raw score

#ABCD Sum Scores Mental Health Youth
CBCL_sum_scores <-as_tibble(read.csv(paste0(dataFold,"ABCD_CBCLS01_DATA_TABLE.csv"))) 

CBCL_sum_scores %>% filter(EVENTNAME =="baseline_year_1_arm_1") %>% dplyr::select(-1:-8)  %>%
 skimr::skim()
Data summary
Name Piped data
Number of rows 11876
Number of columns 81
_______________________
Column type frequency:
character 20
logical 1
numeric 60
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
CBCL_SCR_SYN_ANXDEP_M 0 1 0 143 11865 4 0
CBCL_SCR_SYN_WITHDEP_M 0 1 0 90 11865 4 0
CBCL_SCR_SYN_SOMATIC_M 0 1 0 127 11865 3 0
CBCL_SCR_SYN_SOCIAL_M 0 1 0 120 11865 3 0
CBCL_SCR_SYN_THOUGHT_M 0 1 0 165 11865 4 0
CBCL_SCR_SYN_ATTENTION_M 0 1 0 109 11865 5 0
CBCL_SCR_SYN_RULEBREAK_M 0 1 0 189 11865 5 0
CBCL_SCR_SYN_AGGRESSIVE_M 0 1 0 198 11864 5 0
CBCL_SCR_SYN_INTERNAL_M 0 1 0 362 11865 5 0
CBCL_SCR_SYN_EXTERNAL_M 0 1 0 388 11864 6 0
CBCL_SCR_SYN_TOTPROB_M 0 1 0 1329 11864 7 0
CBCL_SCR_DSM5_DEPRESS_M 0 1 0 145 11865 4 0
CBCL_SCR_DSM5_ANXDISORD_M 0 1 0 99 11865 4 0
CBCL_SCR_DSM5_SOMATICPR_M 0 1 0 83 11865 3 0
CBCL_SCR_DSM5_ADHD_M 0 1 0 77 11865 5 0
CBCL_SCR_DSM5_OPPOSIT_M 0 1 0 54 11865 4 0
CBCL_SCR_DSM5_CONDUCT_M 0 1 0 188 11864 5 0
CBCL_SCR_07_SCT_M 0 1 0 44 11865 3 0
CBCL_SCR_07_OCD_M 0 1 0 88 11865 3 0
CBCL_SCR_07_STRESS_M 0 1 0 155 11865 4 0

Variable type: logical

skim_variable n_missing complete_rate mean count
CBCL_SCR_07_STRESS_NM_2 11876 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CBCL_SCR_SYN_ANXDEP_R 8 1 2.52 3.06 0 0 1 4 26 ▇▁▁▁▁
CBCL_SCR_SYN_ANXDEP_T 8 1 53.48 5.96 50 50 50 54 100 ▇▁▁▁▁
CBCL_SCR_SYN_ANXDEP_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_WITHDEP_R 8 1 1.03 1.71 0 0 0 1 15 ▇▁▁▁▁
CBCL_SCR_SYN_WITHDEP_T 8 1 53.51 5.79 50 50 50 54 97 ▇▁▁▁▁
CBCL_SCR_SYN_WITHDEP_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_SOMATIC_R 8 1 1.49 1.95 0 0 1 2 16 ▇▁▁▁▁
CBCL_SCR_SYN_SOMATIC_T 8 1 54.88 6.05 50 50 53 57 88 ▇▂▁▁▁
CBCL_SCR_SYN_SOMATIC_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_SOCIAL_R 8 1 1.62 2.28 0 0 1 2 18 ▇▁▁▁▁
CBCL_SCR_SYN_SOCIAL_T 8 1 52.80 4.74 50 50 51 53 90 ▇▁▁▁▁
CBCL_SCR_SYN_SOCIAL_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_THOUGHT_R 8 1 1.62 2.19 0 0 1 2 18 ▇▁▁▁▁
CBCL_SCR_SYN_THOUGHT_T 8 1 53.79 5.90 50 50 51 54 84 ▇▁▁▁▁
CBCL_SCR_SYN_THOUGHT_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_ATTENTION_R 8 1 2.98 3.49 0 0 2 5 20 ▇▂▁▁▁
CBCL_SCR_SYN_ATTENTION_T 8 1 53.90 6.17 50 50 51 55 100 ▇▁▁▁▁
CBCL_SCR_SYN_ATTENTION_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_RULEBREAK_R 8 1 1.19 1.86 0 0 0 2 20 ▇▁▁▁▁
CBCL_SCR_SYN_RULEBREAK_T 8 1 52.78 4.90 50 50 50 53 84 ▇▁▁▁▁
CBCL_SCR_SYN_RULEBREAK_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_AGGRESSIVE_R 8 1 3.26 4.35 0 0 2 5 36 ▇▁▁▁▁
CBCL_SCR_SYN_AGGRESSIVE_T 8 1 52.83 5.53 50 50 50 53 100 ▇▁▁▁▁
CBCL_SCR_SYN_AGGRESSIVE_NM 8 1 0.00 0.01 0 0 0 0 1 ▇▁▁▁▁
CBCL_SCR_SYN_INTERNAL_R 8 1 5.05 5.53 0 1 3 7 51 ▇▁▁▁▁
CBCL_SCR_SYN_INTERNAL_T 8 1 48.45 10.64 33 41 48 56 93 ▇▇▃▁▁
CBCL_SCR_SYN_INTERNAL_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_EXTERNAL_R 8 1 4.45 5.86 0 0 2 6 49 ▇▁▁▁▁
CBCL_SCR_SYN_EXTERNAL_T 8 1 45.72 10.33 33 34 44 53 84 ▇▇▃▁▁
CBCL_SCR_SYN_EXTERNAL_NM 8 1 0.00 0.01 0 0 0 0 1 ▇▁▁▁▁
CBCL_SCR_SYN_TOTPROB_R 8 1 18.18 17.97 0 5 13 25 139 ▇▂▁▁▁
CBCL_SCR_SYN_TOTPROB_T 8 1 45.85 11.34 24 38 45 53 83 ▃▇▆▂▁
CBCL_SCR_SYN_TOTPROB_NM 8 1 0.00 0.01 0 0 0 0 1 ▇▁▁▁▁
CBCL_SCR_DSM5_DEPRESS_R 8 1 1.27 2.01 0 0 0 2 19 ▇▁▁▁▁
CBCL_SCR_DSM5_DEPRESS_T 8 1 53.60 5.73 50 50 50 56 89 ▇▁▁▁▁
CBCL_SCR_DSM5_DEPRESS_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_DSM5_ANXDISORD_R 8 1 2.06 2.43 0 0 1 3 17 ▇▂▁▁▁
CBCL_SCR_DSM5_ANXDISORD_T 8 1 53.49 6.13 50 50 51 54 97 ▇▁▁▁▁
CBCL_SCR_DSM5_ANXDISORD_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_DSM5_SOMATICPR_R 8 1 1.08 1.51 0 0 0 2 11 ▇▁▁▁▁
CBCL_SCR_DSM5_SOMATICPR_T 8 1 55.46 6.62 50 50 50 61 90 ▇▂▁▁▁
CBCL_SCR_DSM5_SOMATICPR_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_DSM5_ADHD_R 8 1 2.62 2.97 0 0 2 4 14 ▇▃▂▁▁
CBCL_SCR_DSM5_ADHD_T 8 1 53.23 5.63 50 50 50 55 80 ▇▁▁▁▁
CBCL_SCR_DSM5_ADHD_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_DSM5_OPPOSIT_R 8 1 1.77 2.04 0 0 1 3 10 ▇▂▁▁▁
CBCL_SCR_DSM5_OPPOSIT_T 8 1 53.47 5.41 50 50 51 55 80 ▇▁▁▁▁
CBCL_SCR_DSM5_OPPOSIT_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_DSM5_CONDUCT_R 8 1 1.28 2.36 0 0 0 2 25 ▇▁▁▁▁
CBCL_SCR_DSM5_CONDUCT_T 8 1 53.03 5.53 50 50 50 54 89 ▇▁▁▁▁
CBCL_SCR_DSM5_CONDUCT_NM 8 1 0.00 0.01 0 0 0 0 1 ▇▁▁▁▁
CBCL_SCR_07_SCT_R 8 1 0.52 1.00 0 0 0 1 8 ▇▁▁▁▁
CBCL_SCR_07_SCT_T 8 1 53.02 5.41 50 50 50 56 80 ▇▁▁▁▁
CBCL_SCR_07_SCT_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_07_OCD_R 8 1 1.34 1.82 0 0 1 2 14 ▇▂▁▁▁
CBCL_SCR_07_OCD_T 8 1 53.69 6.12 50 50 51 55 94 ▇▁▁▁▁
CBCL_SCR_07_OCD_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_07_STRESS_R 8 1 2.90 3.35 0 0 2 4 24 ▇▂▁▁▁
CBCL_SCR_07_STRESS_T 8 1 53.32 6.00 50 50 50 54 93 ▇▁▁▁▁
CBCL_SCR_07_STRESS_NM 8 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_sum_scores %>% filter(EVENTNAME =="2_year_follow_up_y_arm_1") %>% dplyr::select(-1:-8)  %>%
 skimr::skim()
Data summary
Name Piped data
Number of rows 10414
Number of columns 81
_______________________
Column type frequency:
character 20
logical 1
numeric 60
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
CBCL_SCR_SYN_ANXDEP_M 0 1 0 0 10414 1 0
CBCL_SCR_SYN_WITHDEP_M 0 1 0 0 10414 1 0
CBCL_SCR_SYN_SOMATIC_M 0 1 0 0 10414 1 0
CBCL_SCR_SYN_SOCIAL_M 0 1 0 0 10414 1 0
CBCL_SCR_SYN_THOUGHT_M 0 1 0 0 10414 1 0
CBCL_SCR_SYN_ATTENTION_M 0 1 0 0 10414 1 0
CBCL_SCR_SYN_RULEBREAK_M 0 1 0 0 10414 1 0
CBCL_SCR_SYN_AGGRESSIVE_M 0 1 0 0 10414 1 0
CBCL_SCR_SYN_INTERNAL_M 0 1 0 0 10414 1 0
CBCL_SCR_SYN_EXTERNAL_M 0 1 0 0 10414 1 0
CBCL_SCR_SYN_TOTPROB_M 0 1 0 0 10414 1 0
CBCL_SCR_DSM5_DEPRESS_M 0 1 0 0 10414 1 0
CBCL_SCR_DSM5_ANXDISORD_M 0 1 0 0 10414 1 0
CBCL_SCR_DSM5_SOMATICPR_M 0 1 0 0 10414 1 0
CBCL_SCR_DSM5_ADHD_M 0 1 0 0 10414 1 0
CBCL_SCR_DSM5_OPPOSIT_M 0 1 0 0 10414 1 0
CBCL_SCR_DSM5_CONDUCT_M 0 1 0 0 10414 1 0
CBCL_SCR_07_SCT_M 0 1 0 0 10414 1 0
CBCL_SCR_07_OCD_M 0 1 0 0 10414 1 0
CBCL_SCR_07_STRESS_M 0 1 0 0 10414 1 0

Variable type: logical

skim_variable n_missing complete_rate mean count
CBCL_SCR_07_STRESS_NM_2 10414 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CBCL_SCR_SYN_ANXDEP_R 2329 0.78 2.32 2.96 0 0 1 3 22 ▇▁▁▁▁
CBCL_SCR_SYN_ANXDEP_T 2329 0.78 53.20 5.77 50 50 50 54 92 ▇▁▁▁▁
CBCL_SCR_SYN_ANXDEP_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_WITHDEP_R 2329 0.78 1.22 1.91 0 0 0 2 16 ▇▁▁▁▁
CBCL_SCR_SYN_WITHDEP_T 2329 0.78 53.47 5.73 50 50 50 54 100 ▇▁▁▁▁
CBCL_SCR_SYN_WITHDEP_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_SOMATIC_R 2329 0.78 1.40 1.92 0 0 1 2 16 ▇▁▁▁▁
CBCL_SCR_SYN_SOMATIC_T 2329 0.78 54.59 5.90 50 50 53 57 88 ▇▂▁▁▁
CBCL_SCR_SYN_SOMATIC_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_SOCIAL_R 2329 0.78 1.31 2.07 0 0 0 2 17 ▇▁▁▁▁
CBCL_SCR_SYN_SOCIAL_T 2329 0.78 52.56 4.73 50 50 50 53 89 ▇▁▁▁▁
CBCL_SCR_SYN_SOCIAL_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_THOUGHT_R 2329 0.78 1.44 2.07 0 0 1 2 22 ▇▁▁▁▁
CBCL_SCR_SYN_THOUGHT_T 2329 0.78 53.61 5.72 50 50 51 55 89 ▇▁▁▁▁
CBCL_SCR_SYN_THOUGHT_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_ATTENTION_R 2329 0.78 2.70 3.31 0 0 1 4 19 ▇▂▁▁▁
CBCL_SCR_SYN_ATTENTION_T 2329 0.78 53.47 5.58 50 50 51 55 97 ▇▁▁▁▁
CBCL_SCR_SYN_ATTENTION_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_RULEBREAK_R 2329 0.78 1.06 1.84 0 0 0 1 23 ▇▁▁▁▁
CBCL_SCR_SYN_RULEBREAK_T 2329 0.78 52.11 4.17 50 50 50 52 84 ▇▁▁▁▁
CBCL_SCR_SYN_RULEBREAK_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_AGGRESSIVE_R 2329 0.78 2.87 4.02 0 0 1 4 33 ▇▁▁▁▁
CBCL_SCR_SYN_AGGRESSIVE_T 2329 0.78 52.37 4.96 50 50 50 52 95 ▇▁▁▁▁
CBCL_SCR_SYN_AGGRESSIVE_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_INTERNAL_R 2329 0.78 4.94 5.62 0 1 3 7 50 ▇▁▁▁▁
CBCL_SCR_SYN_INTERNAL_T 2329 0.78 47.77 10.50 33 40 47 54 90 ▇▇▃▁▁
CBCL_SCR_SYN_INTERNAL_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_EXTERNAL_R 2329 0.78 3.93 5.52 0 0 2 5 50 ▇▁▁▁▁
CBCL_SCR_SYN_EXTERNAL_T 2329 0.78 44.48 9.80 33 34 44 51 83 ▇▅▂▁▁
CBCL_SCR_SYN_EXTERNAL_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_SYN_TOTPROB_R 2329 0.78 16.43 17.06 0 4 11 22 161 ▇▁▁▁▁
CBCL_SCR_SYN_TOTPROB_T 2329 0.78 44.77 11.27 24 37 45 52 88 ▅▇▅▁▁
CBCL_SCR_SYN_TOTPROB_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_DSM5_DEPRESS_R 2329 0.78 1.47 2.24 0 0 1 2 19 ▇▁▁▁▁
CBCL_SCR_DSM5_DEPRESS_T 2329 0.78 53.75 5.88 50 50 51 56 89 ▇▁▁▁▁
CBCL_SCR_DSM5_DEPRESS_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_DSM5_ANXDISORD_R 2329 0.78 1.81 2.31 0 0 1 3 18 ▇▂▁▁▁
CBCL_SCR_DSM5_ANXDISORD_T 2329 0.78 53.35 5.89 50 50 51 53 100 ▇▁▁▁▁
CBCL_SCR_DSM5_ANXDISORD_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_DSM5_SOMATICPR_R 2329 0.78 1.05 1.48 0 0 0 2 11 ▇▁▁▁▁
CBCL_SCR_DSM5_SOMATICPR_T 2329 0.78 54.99 6.36 50 50 50 59 90 ▇▂▁▁▁
CBCL_SCR_DSM5_SOMATICPR_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_DSM5_ADHD_R 2329 0.78 2.29 2.78 0 0 1 4 14 ▇▃▁▁▁
CBCL_SCR_DSM5_ADHD_T 2329 0.78 53.07 5.33 50 50 50 55 80 ▇▁▁▁▁
CBCL_SCR_DSM5_ADHD_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_DSM5_OPPOSIT_R 2329 0.78 1.60 1.95 0 0 1 3 10 ▇▂▁▁▁
CBCL_SCR_DSM5_OPPOSIT_T 2329 0.78 53.07 5.03 50 50 51 55 80 ▇▁▁▁▁
CBCL_SCR_DSM5_OPPOSIT_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_DSM5_CONDUCT_R 2329 0.78 1.13 2.23 0 0 0 1 25 ▇▁▁▁▁
CBCL_SCR_DSM5_CONDUCT_T 2329 0.78 52.40 4.81 50 50 50 52 87 ▇▁▁▁▁
CBCL_SCR_DSM5_CONDUCT_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_07_SCT_R 2329 0.78 0.50 0.97 0 0 0 1 8 ▇▁▁▁▁
CBCL_SCR_07_SCT_T 2329 0.78 52.70 5.02 50 50 50 55 80 ▇▁▁▁▁
CBCL_SCR_07_SCT_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_07_OCD_R 2329 0.78 1.24 1.75 0 0 1 2 16 ▇▁▁▁▁
CBCL_SCR_07_OCD_T 2329 0.78 53.30 5.72 50 50 51 55 100 ▇▁▁▁▁
CBCL_SCR_07_OCD_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_SCR_07_STRESS_R 2329 0.78 2.74 3.25 0 0 2 4 28 ▇▁▁▁▁
CBCL_SCR_07_STRESS_T 2329 0.78 52.92 5.53 50 50 50 54 100 ▇▁▁▁▁
CBCL_SCR_07_STRESS_NM 2329 0.78 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CBCL_sum_score_select <- CBCL_sum_scores %>% dplyr::select(all_of(c("SUBJECTKEY","EVENTNAME", 
                             "CBCL_SCR_SYN_ANXDEP_R", "CBCL_SCR_SYN_WITHDEP_R", "CBCL_SCR_SYN_SOMATIC_R", "CBCL_SCR_SYN_SOCIAL_R",     
                             "CBCL_SCR_SYN_THOUGHT_R","CBCL_SCR_SYN_ATTENTION_R",  "CBCL_SCR_SYN_RULEBREAK_R",  "CBCL_SCR_SYN_AGGRESSIVE_R")))

2.8 Join data

all_sum_vars <- 
  plyr::join_all(list(Siteinfo, ASRPrecomputedSelected,BISBAS,UPPS,CBCL_sum_score_select,vision_idx,ACSselected), 
                 by=c('SUBJECTKEY','EVENTNAME'), type='full') %>%
  filter(visionProb != 1|is.na(visionProb)) %>% #remove subjects with eyesight problems 
  dplyr::select(-visionProb)%>%
  filter(EVENTNAME %in% c("baseline_year_1_arm_1","2_year_follow_up_y_arm_1"))## select the event for analysis
## there are all NAs for event name of 42_month_follow_up_arm_1 and 30_month_follow_up_arm_1


all_sum_vars %>%  filter(EVENTNAME =="baseline_year_1_arm_1") %>% dplyr::select(-1:-2) %>%
   skimr::skim()
Data summary
Name Piped data
Number of rows 11845
Number of columns 55
_______________________
Column type frequency:
character 6
factor 2
numeric 47
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
SUBJECTKEY 0 1 12 16 0 11845 0
SRC_SUBJECT_ID 0 1 16 16 0 11845 0
INTERVIEW_DATE 0 1 9 9 0 756 0
SEX 0 1 1 1 0 2 0
EVENTNAME 0 1 21 21 0 1 0
SITE_ID_L 0 1 6 6 0 22 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
RACE_ETHNICITY 2 1 FALSE 5 Whi: 6171, His: 2403, Bla: 1775, Oth: 1243
REL_FAMILY_ID 0 1 FALSE 9832 373: 5, 749: 4, 11: 3, 400: 3

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
INTERVIEW_AGE 0 1.00 118.98 7.49 107.00 112.00 119.00 126.00 133.00 ▇▆▆▆▆
SCHED_DELAY 13 1.00 7.00 0.10 1.00 7.00 7.00 7.00 7.00 ▁▁▁▁▇
SCHED_HYBRID 11845 0.00 NaN NA NA NA NA NA NA
ASR_SCR_PERSTR_R 5 1.00 16.52 3.52 0.00 15.00 17.00 19.00 22.00 ▁▁▂▇▇
ASR_SCR_ANXDEP_R 5 1.00 5.04 4.93 0.00 1.00 4.00 7.00 35.00 ▇▂▁▁▁
ASR_SCR_WITHDRAWN_R 5 1.00 1.56 2.11 0.00 0.00 1.00 2.00 17.00 ▇▁▁▁▁
ASR_SCR_SOMATIC_R 5 1.00 2.94 3.18 0.00 1.00 2.00 4.00 23.00 ▇▂▁▁▁
ASR_SCR_THOUGHT_R 5 1.00 1.43 1.85 0.00 0.00 1.00 2.00 18.00 ▇▁▁▁▁
ASR_SCR_ATTENTION_R 5 1.00 4.63 4.26 0.00 1.00 4.00 7.00 27.00 ▇▃▁▁▁
ASR_SCR_AGGRESSIVE_R 5 1.00 3.35 3.57 0.00 1.00 2.00 5.00 25.00 ▇▂▁▁▁
ASR_SCR_RULEBREAK_R 5 1.00 1.19 1.85 0.00 0.00 0.00 2.00 18.00 ▇▁▁▁▁
ASR_SCR_INTRUSIVE_R 5 1.00 1.00 1.43 0.00 0.00 0.00 2.00 10.00 ▇▁▁▁▁
ASR_SCR_INTERNAL_R 5 1.00 9.53 8.74 0.00 3.00 7.00 13.00 70.00 ▇▂▁▁▁
ASR_SCR_EXTERNAL_R 5 1.00 5.55 5.62 0.00 1.00 4.00 8.00 39.00 ▇▂▁▁▁
ASR_SCR_TOTPROB_R 4 1.00 21.15 17.96 0.00 8.00 16.00 29.00 154.00 ▇▂▁▁▁
ASR_SCR_DEPRESS_R 5 1.00 3.97 3.66 0.00 1.00 3.00 6.00 28.00 ▇▂▁▁▁
ASR_SCR_ANXDISORD_R 5 1.00 3.76 2.56 0.00 2.00 3.00 5.00 12.00 ▇▆▆▂▁
ASR_SCR_SOMATICPR_R 5 1.00 1.91 2.37 0.00 0.00 1.00 3.00 18.00 ▇▂▁▁▁
ASR_SCR_AVOIDANT_R 5 1.00 2.01 2.15 0.00 0.00 1.00 3.00 14.00 ▇▃▁▁▁
ASR_SCR_ADHD_R 5 1.00 3.80 3.64 0.00 1.00 3.00 6.00 25.00 ▇▂▁▁▁
ASR_SCR_ANTISOCIAL_R 5 1.00 2.11 2.49 0.00 0.00 1.00 3.00 22.00 ▇▁▁▁▁
ASR_SCR_INATTENTION_R 5 1.00 2.35 2.35 0.00 0.00 2.00 4.00 14.00 ▇▃▁▁▁
ASR_SCR_HYPERACTIVE_R 5 1.00 1.46 1.73 0.00 0.00 1.00 2.00 12.00 ▇▂▁▁▁
BISAvg 22 1.00 1.38 0.71 0.00 0.75 1.25 2.00 3.00 ▃▅▇▃▂
BASRRAvg 23 1.00 2.20 0.62 0.00 1.75 2.25 2.75 3.00 ▁▁▅▆▇
BASDriveAvg 23 1.00 1.03 0.77 0.00 0.50 1.00 1.50 3.00 ▇▆▅▂▂
BASFunAvg 23 1.00 1.43 0.66 0.00 1.00 1.50 1.75 3.00 ▂▅▇▃▂
BASAllAvg 23 1.00 1.55 0.54 0.00 1.17 1.50 1.92 3.00 ▁▅▇▅▂
UPPS_Y_SS_NEGATIVE_URGENCY 23 1.00 8.49 2.65 4.00 7.00 8.00 10.00 16.00 ▆▇▇▂▁
UPPS_Y_SS_LACK_OF_PLANNING 23 1.00 7.74 2.38 4.00 6.00 8.00 9.00 16.00 ▆▇▅▁▁
UPPS_Y_SS_SENSATION_SEEKING 23 1.00 9.77 2.68 4.00 8.00 10.00 12.00 16.00 ▂▅▇▃▂
UPPS_Y_SS_POSITIVE_URGENCY 23 1.00 7.99 2.96 4.00 6.00 8.00 10.00 16.00 ▇▆▆▂▁
UPPS_Y_SS_LACK_OF_PERSEVERANCE 23 1.00 7.04 2.25 4.00 5.00 7.00 8.00 16.00 ▇▆▃▁▁
CBCL_SCR_SYN_ANXDEP_R 8 1.00 2.52 3.07 0.00 0.00 1.00 4.00 26.00 ▇▁▁▁▁
CBCL_SCR_SYN_WITHDEP_R 8 1.00 1.04 1.71 0.00 0.00 0.00 1.00 15.00 ▇▁▁▁▁
CBCL_SCR_SYN_SOMATIC_R 8 1.00 1.50 1.95 0.00 0.00 1.00 2.00 16.00 ▇▁▁▁▁
CBCL_SCR_SYN_SOCIAL_R 8 1.00 1.63 2.28 0.00 0.00 1.00 2.00 18.00 ▇▁▁▁▁
CBCL_SCR_SYN_THOUGHT_R 8 1.00 1.62 2.20 0.00 0.00 1.00 2.00 18.00 ▇▁▁▁▁
CBCL_SCR_SYN_ATTENTION_R 8 1.00 2.98 3.49 0.00 0.00 2.00 5.00 20.00 ▇▂▁▁▁
CBCL_SCR_SYN_RULEBREAK_R 8 1.00 1.19 1.86 0.00 0.00 0.00 2.00 20.00 ▇▁▁▁▁
CBCL_SCR_SYN_AGGRESSIVE_R 8 1.00 3.26 4.36 0.00 0.00 2.00 5.00 36.00 ▇▁▁▁▁
ABCD_SVS01_ID 0 1.00 50523.16 6430.86 39373.00 44951.00 50525.00 56092.00 61660.00 ▇▇▇▇▇
SNELLEN_AID_Y 21 1.00 0.24 0.43 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
SNELLEN_AIDPRES_Y 8987 0.24 0.62 0.49 0.00 0.00 1.00 1.00 1.00 ▅▁▁▁▇
SNELLEN_VA_Y 28 1.00 6.84 1.44 2.00 6.00 7.00 8.00 11.00 ▁▃▇▆▁
VIS_FLG 11083 0.06 1.00 0.00 1.00 1.00 1.00 1.00 1.00 ▁▁▇▁▁
ACS_RAKED_PROPENSITY_SCORE 0 1.00 691.25 350.96 161.36 448.94 619.31 821.72 1778.92 ▅▇▂▂▁
all_sum_vars %>%  filter(EVENTNAME =="2_year_follow_up_y_arm_1") %>% dplyr::select(-1:-2) %>%
   skimr::skim()
Data summary
Name Piped data
Number of rows 10387
Number of columns 55
_______________________
Column type frequency:
character 6
factor 2
numeric 47
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
SUBJECTKEY 0 1 12 16 0 10387 0
SRC_SUBJECT_ID 0 1 16 16 0 10387 0
INTERVIEW_DATE 0 1 9 9 0 788 0
SEX 0 1 1 1 0 2 0
EVENTNAME 0 1 24 24 0 1 0
SITE_ID_L 0 1 6 6 0 21 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
RACE_ETHNICITY 10387 0 FALSE 0 Whi: 0, Bla: 0, His: 0, Asi: 0
REL_FAMILY_ID 10387 0 FALSE 0 0: 0, 1: 0, 3: 0, 4: 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
INTERVIEW_AGE 0 1.00 144.04 7.95 127 137.00 144.00 151.00 168 ▅▇▇▅▁
SCHED_DELAY 0 1.00 7.55 0.89 7 7.00 7.00 9.00 9 ▇▁▁▁▃
SCHED_HYBRID 7674 0.26 0.49 0.50 0 0.00 0.00 1.00 1 ▇▁▁▁▇
ASR_SCR_PERSTR_R 57 0.99 16.10 3.70 0 14.00 17.00 19.00 22 ▁▁▃▇▇
ASR_SCR_ANXDEP_R 57 0.99 5.03 4.96 0 1.00 4.00 7.00 34 ▇▂▁▁▁
ASR_SCR_WITHDRAWN_R 57 0.99 1.56 2.06 0 0.00 1.00 2.00 18 ▇▁▁▁▁
ASR_SCR_SOMATIC_R 57 0.99 2.95 3.17 0 1.00 2.00 4.00 22 ▇▂▁▁▁
ASR_SCR_THOUGHT_R 57 0.99 1.28 1.71 0 0.00 1.00 2.00 17 ▇▁▁▁▁
ASR_SCR_ATTENTION_R 57 0.99 4.59 4.22 0 1.00 3.00 7.00 29 ▇▃▁▁▁
ASR_SCR_AGGRESSIVE_R 57 0.99 3.13 3.37 0 1.00 2.00 5.00 26 ▇▂▁▁▁
ASR_SCR_RULEBREAK_R 57 0.99 1.06 1.73 0 0.00 0.00 2.00 21 ▇▁▁▁▁
ASR_SCR_INTRUSIVE_R 57 0.99 0.95 1.39 0 0.00 0.00 1.00 11 ▇▁▁▁▁
ASR_SCR_INTERNAL_R 57 0.99 9.55 8.72 0 3.00 7.00 13.00 65 ▇▂▁▁▁
ASR_SCR_EXTERNAL_R 57 0.99 5.14 5.30 0 1.00 4.00 7.00 57 ▇▁▁▁▁
ASR_SCR_TOTPROB_R 57 0.99 20.55 17.46 0 8.00 16.00 28.00 133 ▇▂▁▁▁
ASR_SCR_DEPRESS_R 57 0.99 4.00 3.67 0 1.00 3.00 6.00 25 ▇▂▁▁▁
ASR_SCR_ANXDISORD_R 57 0.99 3.64 2.53 0 2.00 3.00 5.00 12 ▇▆▆▁▁
ASR_SCR_SOMATICPR_R 57 0.99 1.92 2.35 0 0.00 1.00 3.00 17 ▇▂▁▁▁
ASR_SCR_AVOIDANT_R 57 0.99 1.98 2.11 0 0.00 1.00 3.00 13 ▇▃▁▁▁
ASR_SCR_ADHD_R 57 0.99 3.68 3.54 0 1.00 3.00 5.00 23 ▇▃▁▁▁
ASR_SCR_ANTISOCIAL_R 57 0.99 2.00 2.35 0 0.00 1.00 3.00 30 ▇▁▁▁▁
ASR_SCR_INATTENTION_R 57 0.99 2.35 2.33 0 0.00 2.00 4.00 14 ▇▃▁▁▁
ASR_SCR_HYPERACTIVE_R 57 0.99 1.33 1.66 0 0.00 1.00 2.00 12 ▇▁▁▁▁
BISAvg 30 1.00 1.24 0.71 0 0.75 1.25 1.75 3 ▅▆▇▃▂
BASRRAvg 30 1.00 1.96 0.65 0 1.50 2.00 2.50 3 ▁▂▇▇▇
BASDriveAvg 30 1.00 0.92 0.70 0 0.50 0.75 1.25 3 ▇▆▅▁▁
BASFunAvg 30 1.00 1.14 0.65 0 0.75 1.00 1.50 3 ▆▇▇▂▁
BASAllAvg 30 1.00 1.34 0.53 0 1.00 1.33 1.67 3 ▂▇▇▃▁
UPPS_Y_SS_NEGATIVE_URGENCY 30 1.00 7.76 2.34 4 6.00 8.00 9.00 16 ▇▇▇▁▁
UPPS_Y_SS_LACK_OF_PLANNING 30 1.00 7.77 2.24 4 6.00 8.00 9.00 16 ▅▇▃▁▁
UPPS_Y_SS_SENSATION_SEEKING 29 1.00 9.49 2.68 3 8.00 9.00 11.00 16 ▂▇▇▇▂
UPPS_Y_SS_POSITIVE_URGENCY 30 1.00 7.39 2.67 4 5.00 8.00 9.00 16 ▇▆▅▁▁
UPPS_Y_SS_LACK_OF_PERSEVERANCE 30 1.00 6.98 2.25 4 5.00 7.00 8.00 16 ▇▇▃▁▁
CBCL_SCR_SYN_ANXDEP_R 2327 0.78 2.32 2.96 0 0.00 1.00 3.00 22 ▇▁▁▁▁
CBCL_SCR_SYN_WITHDEP_R 2327 0.78 1.21 1.91 0 0.00 0.00 2.00 16 ▇▁▁▁▁
CBCL_SCR_SYN_SOMATIC_R 2327 0.78 1.40 1.91 0 0.00 1.00 2.00 16 ▇▁▁▁▁
CBCL_SCR_SYN_SOCIAL_R 2327 0.78 1.31 2.07 0 0.00 0.00 2.00 17 ▇▁▁▁▁
CBCL_SCR_SYN_THOUGHT_R 2327 0.78 1.44 2.07 0 0.00 1.00 2.00 22 ▇▁▁▁▁
CBCL_SCR_SYN_ATTENTION_R 2327 0.78 2.70 3.31 0 0.00 1.00 4.00 19 ▇▂▁▁▁
CBCL_SCR_SYN_RULEBREAK_R 2327 0.78 1.06 1.84 0 0.00 0.00 1.00 23 ▇▁▁▁▁
CBCL_SCR_SYN_AGGRESSIVE_R 2327 0.78 2.87 4.02 0 0.00 1.00 4.00 33 ▇▁▁▁▁
ABCD_SVS01_ID 0 1.00 50511.20 6441.84 39372 44933.00 50506.00 56094.00 61661 ▇▇▇▇▇
SNELLEN_AID_Y 1626 0.84 0.28 0.45 0 0.00 0.00 1.00 1 ▇▁▁▁▃
SNELLEN_AIDPRES_Y 7945 0.24 0.67 0.47 0 0.00 1.00 1.00 1 ▃▁▁▁▇
SNELLEN_VA_Y 1625 0.84 7.11 1.53 2 6.00 7.00 8.00 11 ▁▃▇▇▁
VIS_FLG 9840 0.05 1.00 0.00 1 1.00 1.00 1.00 1 ▁▁▇▁▁
ACS_RAKED_PROPENSITY_SCORE 10387 0.00 NaN NA NA NA NA NA NA

2.9 preprocess site

make sure that there are no members from the same family at different sites

all_sum_vars_baseline <- all_sum_vars %>% filter(EVENTNAME =="baseline_year_1_arm_1") 

all_sum_vars_baseline %>% count(SITE_ID_L)
##    SITE_ID_L    n
## 1     site01  405
## 2     site02  558
## 3     site03  629
## 4     site04  745
## 5     site05  377
## 6     site06  580
## 7     site07  339
## 8     site08  350
## 9     site09  433
## 10    site10  736
## 11    site11  448
## 12    site12  600
## 13    site13  726
## 14    site14  606
## 15    site15  457
## 16    site16 1010
## 17    site17  577
## 18    site18  384
## 19    site19  549
## 20    site20  701
## 21    site21  599
## 22    site22   36
# check if there are members from the same family at different sites. There are 6 of them.
all_sum_vars_baseline %>%
  drop_na(SITE_ID_L) %>%
  filter(SITE_ID_L != "site22") %>%
  count(REL_FAMILY_ID, SITE_ID_L) %>%
  spread(SITE_ID_L, n, fill = 0) %>%
  dplyr::select(-REL_FAMILY_ID) %>% 
       as.matrix %>% 
       crossprod
##        site01 site02 site03 site04 site05 site06 site07 site08 site09 site10
## site01    495      0      0      0      0      0      0      0      0      0
## site02      0   1048      0      0      0      0      0      0      0      0
## site03      0      0    751      0      0      0      0      0      0      0
## site04      0      0      0    955      0      0      0      0      0      0
## site05      0      0      0      0    473      0      0      0      0      0
## site06      0      0      0      0      0    688      0      0      0      0
## site07      0      0      0      0      0      0    425      0      0      0
## site08      0      0      0      0      0      0      0    434      0      0
## site09      0      0      0      0      0      0      0      0    479      0
## site10      0      0      0      0      0      0      0      0      0    910
## site11      0      0      0      0      0      0      0      0      0      0
## site12      0      0      0      0      0      0      0      0      0      0
## site13      0      0      0      0      0      0      0      0      0      0
## site14      0      0      0      0      0      0      0      0      0      0
## site15      0      0      0      0      0      0      0      0      0      0
## site16      0      0      0      0      0      0      0      0      0      0
## site17      0      0      0      0      0      0      0      0      0      0
## site18      0      0      0      0      0      0      0      0      0      0
## site19      0      0      0      0      0      0      0      0      0      0
## site20      0      0      0      0      0      0      0      0      0      0
## site21      0      0      0      0      0      0      0      0      0      0
##        site11 site12 site13 site14 site15 site16 site17 site18 site19 site20
## site01      0      0      0      0      0      0      0      0      0      0
## site02      0      0      0      0      0      0      0      0      0      0
## site03      0      0      0      0      0      0      0      0      0      0
## site04      0      0      0      0      0      0      0      0      0      0
## site05      0      0      0      0      0      0      0      0      0      0
## site06      0      0      0      0      0      0      0      0      0      0
## site07      0      0      0      0      0      0      0      0      0      0
## site08      0      0      0      0      0      0      0      0      0      0
## site09      0      0      0      0      0      0      0      0      0      0
## site10      0      0      0      0      0      0      0      0      0      0
## site11    562      0      0      0      0      0      0      0      0      0
## site12      0    746      0      0      0      0      0      0      0      0
## site13      0      0    888      0      0      0      0      0      0      0
## site14      0      0      0   1106      0      0      0      0      0      0
## site15      0      0      0      0    549      0      0      0      0      0
## site16      0      0      0      0      0   1394      0      0      0      0
## site17      0      0      0      0      0      0    697      0      0      0
## site18      0      0      0      0      0      0      0    448      0      0
## site19      0      0      0      0      0      0      0      0   1015      0
## site20      0      0      0      0      0      0      0      0      0   1187
## site21      0      0      0      0      0      0      0      0      0      0
##        site21
## site01      0
## site02      0
## site03      0
## site04      0
## site05      0
## site06      0
## site07      0
## site08      0
## site09      0
## site10      0
## site11      0
## site12      0
## site13      0
## site14      0
## site15      0
## site16      0
## site17      0
## site18      0
## site19      0
## site20      0
## site21    723
#below will remove those 6 
all_sum_vars_baseline_no_dup <- all_sum_vars_baseline %>%
  drop_na(SITE_ID_L) %>%
  filter(SITE_ID_L != "site22") %>%
  group_by(REL_FAMILY_ID) %>% 
  nest(SITE_ID_L, .key="SITE_ID_L") %>%
  mutate(dup = ifelse(length(c(unlist(SITE_ID_L)))==1,0,
                      ifelse(length(unique(c(unlist(SITE_ID_L)))) > 1,1,0))) %>%
  unnest(SITE_ID_L) %>%
  ungroup()
## Warning: Supplying `...` without names was deprecated in tidyr 1.0.0.
## ℹ Please specify a name for each selection.
## ℹ Did you want `SITE_ID_L = SITE_ID_L`?
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
family_exclude <- unique(all_sum_vars_baseline_no_dup$REL_FAMILY_ID[which(all_sum_vars_baseline_no_dup$dup==1)])

all_sum_vars_no_dup <- all_sum_vars%>%
                 filter(!REL_FAMILY_ID %in% family_exclude)%>%
    drop_na(SITE_ID_L) %>%
  filter(SITE_ID_L != "site22")

3 Modeling for Mental health features fit with the cognitive abilities

Samples: REL_FAMILY_ID (9856 Levels) SITE_ID_L (need to remove 22nd site. having too few subjects) ALSO make sure about EVENTNAME

Target: Factor analysis of cognitive abilities: gfactor

72 Features:

SEX Ethnicity - Black, Hispanic, Asian, Other

Features by categories:

Parent Mental Health(9): ASR_SCR_PERSTR_R ASR_SCR_ANXDEP_R ASR_SCR_WITHDRAWN_R ASR_SCR_SOMATIC_R ASR_SCR_THOUGHT_R ASR_SCR_ATTENTION_R ASR_SCR_AGGRESSIVE_R ASR_SCR_RULEBREAK_R ASR_SCR_INTRUSIVE_R

Child Personality(9): BISAvg BASRRAvg BASDriveAvg BASFunAvg UPPS_Y_SS_NEGATIVE_URGENCY UPPS_Y_SS_LACK_OF_PLANNING UPPS_Y_SS_SENSATION_SEEKING UPPS_Y_SS_POSITIVE_URGENCY UPPS_Y_SS_LACK_OF_PERSEVERANCE

CBCL sum scores

“CBCL_SCR_SYN_ANXDEP_R” “CBCL_SCR_SYN_WITHDEP_R” “CBCL_SCR_SYN_SOMATIC_R” “CBCL_SCR_SYN_SOCIAL_R” “CBCL_SCR_SYN_THOUGHT_R” “CBCL_SCR_SYN_ATTENTION_R” “CBCL_SCR_SYN_RULEBREAK_R” “CBCL_SCR_SYN_AGGRESSIVE_R”

3.1 process data for modelling

features: Child Personality(9) CBCL sum scores(8) Parent Mental Health(9)

Set up vector of names based on different categories of features

subj_info <-  c("SUBJECTKEY","EVENTNAME","SITE_ID_L")   
## features name vec

CBCL_sum_scores_names <-c("CBCL_SCR_SYN_ANXDEP_R","CBCL_SCR_SYN_WITHDEP_R","CBCL_SCR_SYN_SOMATIC_R","CBCL_SCR_SYN_SOCIAL_R","CBCL_SCR_SYN_THOUGHT_R","CBCL_SCR_SYN_ATTENTION_R","CBCL_SCR_SYN_RULEBREAK_R","CBCL_SCR_SYN_AGGRESSIVE_R")


Parent_Mental_Health_names<- c("ASR_SCR_PERSTR_R","ASR_SCR_ANXDEP_R","ASR_SCR_WITHDRAWN_R","ASR_SCR_SOMATIC_R","ASR_SCR_THOUGHT_R","ASR_SCR_ATTENTION_R","ASR_SCR_AGGRESSIVE_R","ASR_SCR_RULEBREAK_R","ASR_SCR_INTRUSIVE_R")

Child_Personality_names<- c("BISAvg","BASRRAvg","BASDriveAvg","BASFunAvg","UPPS_Y_SS_NEGATIVE_URGENCY","UPPS_Y_SS_LACK_OF_PLANNING","UPPS_Y_SS_SENSATION_SEEKING","UPPS_Y_SS_POSITIVE_URGENCY","UPPS_Y_SS_LACK_OF_PERSEVERANCE")

## features related to subject information

features <- c(CBCL_sum_scores_names,Parent_Mental_Health_names,Child_Personality_names)


all_features_no_dup <- all_sum_vars_no_dup %>% dplyr::select(all_of(subj_info),all_of(CBCL_sum_scores_names),all_of(Parent_Mental_Health_names),all_of(Child_Personality_names))

### select the data table with subject information and variable names

all_selected_no_dup <- all_sum_vars_no_dup %>% dplyr::select(all_of(subj_info),all_of(CBCL_sum_scores_names),all_of(Parent_Mental_Health_names),all_of(Child_Personality_names))

Making data splits based on the site.

site_col <- all_selected_no_dup  %>%
  distinct(SITE_ID_L) %>% 
  arrange(SITE_ID_L) 

site_list <- as.list(site_col$SITE_ID_L)

site_char <- as.character(unlist(site_col$SITE_ID_L))

split_func  <- function(site,data_input)
{ 
  train_indices <- which(data_input$SITE_ID_L != site)
  test_indices <- which(data_input$SITE_ID_L == site)
  
  indices <-
  list(analysis   = train_indices, 
       assessment = test_indices)
split <- make_splits(indices, data_input)
return(split)}

split_list <- purrr::map(site_list, ~split_func(.x,data_input =all_selected_no_dup ))


names(split_list) <- site_char

process features

The following data processing function does not include combat and removal of outliers. What is does is only scale baseline train and test data and then scale the followup train and test data with mean and standard deviation from baseline respectively.

processed_psychopathology_scale_seperate_features_only <- purrr::map(.x= split_list,
                                                              ~data_processing_cross_sites_seperate_no_combat_no_iqr(split_input = .)) 

3.2 loading the psychopathology scores

3.3 Loading gfactor

baseline_train_gfactor <- purrr::map(gfactor_list,"output_train_baseline")
baseline_test_gfactor <- purrr::map(gfactor_list,"output_test_baseline")
followup_train_gfactor <- purrr::map(gfactor_list,"output_train_followup")
followup_test_gfactor <- purrr::map(gfactor_list,"output_test_followup")

Join the loaded data

processed_train_baseline_scale_seperate_all <-processed_psychopathology_scale_seperate_features_only %>%
                             purrr::map(.,~filter(.,EVENTNAME=="baseline_year_1_arm_1"& fold =="train")%>%
                                   dplyr::select(-fold))


processed_test_baseline_scale_seperate_all <-processed_psychopathology_scale_seperate_features_only %>%
                             purrr::map(.,~filter(.,EVENTNAME=="baseline_year_1_arm_1"& fold =="test")%>%
                                   dplyr::select(-fold))


processed_train_followup_scale_seperate_all <- processed_psychopathology_scale_seperate_features_only %>%
                              purrr::map(.,~filter(.,EVENTNAME=="2_year_follow_up_y_arm_1"& fold =="train")%>%
                                   dplyr::select(-fold))


processed_test_followup_scale_seperate_all <- processed_psychopathology_scale_seperate_features_only %>%
                              purrr::map(.,~filter(.,EVENTNAME=="2_year_follow_up_y_arm_1"& fold =="test")%>%
                                   dplyr::select(-fold))

psychopathology_baseline_train <- purrr::map2(.x =baseline_train_gfactor ,
                                       .y=processed_train_baseline_scale_seperate_all, 
                                               ~left_join(.x,.y, by = "SUBJECTKEY")%>% drop_na())

psychopathology_baseline_test <- purrr::map2(.x =baseline_test_gfactor ,.y=processed_test_baseline_scale_seperate_all, 
                                               ~left_join(.x,.y, by ="SUBJECTKEY")%>% drop_na())

psychopathology_followup_train <- purrr::map2(.x =followup_train_gfactor ,
                                       .y=processed_train_followup_scale_seperate_all, 
                                               ~left_join(.x,.y, by = "SUBJECTKEY")%>% drop_na())


psychopathology_followup_test <- purrr::map2(.x =followup_test_gfactor ,
                                      .y=processed_test_followup_scale_seperate_all, 
                                               ~left_join(.x,.y, by = "SUBJECTKEY")%>% drop_na())



psychopathology_feature_names <- psychopathology_baseline_train[[1]] %>%
                                           dplyr::select( -all_of(subj_info),-"gfactor")%>%
                                           colnames()%>% 
                                           as.character()


psychopathology_baseline_train_select <- purrr::map(.x = psychopathology_baseline_train,
                                                     ~dplyr::select(.x,all_of(psychopathology_feature_names),"gfactor"))


psychopathology_baseline_test_select <- purrr::map(.x = psychopathology_baseline_test,
                                                     ~dplyr::select(.x,all_of(psychopathology_feature_names),"gfactor"))

psychopathology_followup_train_select <- purrr::map(.x = psychopathology_followup_train,
                                                     ~dplyr::select(.x,all_of(psychopathology_feature_names),"gfactor"))

psychopathology_followup_test_select <- purrr::map(.x = psychopathology_followup_test ,
                                                ~dplyr::select(.x,all_of(psychopathology_feature_names),"gfactor"))

4 Fitting partial least square regression models

4.1 Modelling

Baseline models and performance metrics.

### baseline
### setting up the recipe
features <- psychopathology_feature_names
psychopathology_baseline_recipe_list <- purrr::map(.x = psychopathology_baseline_train_select,
                             ~recipe_prep(train_input=.x)) 

#recipe_input <- psychopathology_baseline_recipe_list[[1]]
### pls tuning
psychopathology_pls_fit_baseline <- psychopathology_baseline_recipe_list %>% purrr::map(.,~pls_tune(recipe_input = .)) 

psychopathology_pls_fit_wf <- purrr::map(psychopathology_pls_fit_baseline,"pls_final_wf")

### model final fit and prediction
psychopathology_pls_pred_baseline <- purrr::pmap(list(psychopathology_baseline_recipe_list,
                                                         psychopathology_pls_fit_wf,
                                                         psychopathology_baseline_test_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 





psychopathology_pls_pred_baseline_results <- purrr::map(psychopathology_pls_pred_baseline,"model_predict")


psychopathology_pls_pred_baseline_train <- purrr::pmap(list(psychopathology_baseline_recipe_list,
                                                         psychopathology_pls_fit_wf,
                                                         psychopathology_baseline_train_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 





psychopathology_pls_pred_baseline_results_train <- purrr::map(psychopathology_pls_pred_baseline_train,"model_predict")
### compute the performance metric
psychopathology_baseline_metric <- purrr::map2(.x=psychopathology_pls_pred_baseline_results,
     .y=psychopathology_baseline_test,~metric_compute_site(data_input =.x ,
                                           site_input = .y)) %>%
                      do.call(rbind,.)

      psychopathology_baseline_metric%>% 
    kableExtra::kbl(caption = "metrics for all sites in baseline") %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for all sites in baseline
correlation tradrsq MAE RMSE site
0.3658249 0.1303598 0.7421831 0.9241757 site01
0.3313679 0.1016078 0.7432904 0.9476698 site02
0.3381908 0.1101639 0.7402534 0.9425222 site03
0.3678763 0.1333168 0.7312679 0.9303247 site04
0.5597447 0.2956101 0.6585767 0.8381226 site05
0.3410484 0.1077068 0.7405508 0.9443613 site06
0.4784056 0.2254262 0.6850812 0.8787395 site07
0.4093874 0.1668964 0.7210578 0.9113978 site08
0.3879802 0.1504608 0.7294099 0.9205311 site09
0.4028957 0.1622433 0.7113103 0.9134528 site10
0.4258926 0.1813245 0.7134233 0.9048017 site11
0.5021376 0.2485132 0.6750443 0.8668803 site12
0.4243384 0.1798012 0.7145275 0.9050138 site13
0.2501605 0.0387981 0.7721574 0.9795862 site14
0.3671954 0.1313395 0.7533609 0.9303344 site15
0.3353717 0.1049362 0.7334809 0.9455762 site16
0.3828547 0.1446389 0.7209690 0.9240162 site17
0.3260003 0.1001825 0.7566932 0.9473282 site18
0.3988269 0.1574067 0.7273404 0.9175321 site19
0.3984798 0.1576716 0.7191726 0.9171084 site20
0.4396583 0.1928774 0.7099830 0.8969044 site21
psychopathology_baseline_metric_avg <- average_metric_one_mod(metric_list =psychopathology_baseline_metric)

psychopathology_baseline_metric_avg_table <- psychopathology_baseline_metric_avg %>%
  mutate_if(is.numeric, round, digits=3)%>%
  mutate("correlation (sd)" = paste0(correlation," (",cor_sd,")"))%>%
  mutate("tradrsq (sd)" = paste0(tradrsq," (",rsq_sd,")"))%>%
  mutate("MAE (sd)" = paste0(MAE," (",mae_sd,")"))%>%
  mutate("RMSE (sd)" = paste0(RMSE," (",rmse_sd,")"))%>%
  select_if(is.character)
  
avg_table_var_names <- c("correlation (sd)", "tradrsq (sd)","MAE (sd)","RMSE (sd)"  )

  psychopathology_baseline_metric_avg_table%>%
    dplyr::select(all_of(avg_table_var_names))%>%
    kableExtra::kbl(caption = paste0("metrics for modalities averaged across sites in baseline")) %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for modalities averaged across sites in baseline
correlation (sd) tradrsq (sd) MAE (sd) RMSE (sd)
0.392 (0.068) 0.153 (0.057) 0.724 (0.027) 0.918 (0.031)

Modelling followup

### setting up the recipe
psychopathology_followup_recipe_list <- purrr::map(.x = psychopathology_followup_train_select,
                             ~recipe_prep(train_input=.x)) 

#recipe_input <- psychopathology_baseline_recipe_list[[1]]
### pls tuning
psychopathology_pls_fit_followup <- psychopathology_followup_recipe_list %>% 
                                    purrr::map(.,~pls_tune(recipe_input = .)) 

psychopathology_pls_fit_wf_followup <- purrr::map(psychopathology_pls_fit_followup,"pls_final_wf")

### model final fit and prediction
psychopathology_pls_pred_followup <- purrr::pmap(list(psychopathology_followup_recipe_list,
                                                         psychopathology_pls_fit_wf_followup,
                                                         psychopathology_followup_test_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 





psychopathology_pls_pred_followup_results <- purrr::map(psychopathology_pls_pred_followup,"model_predict")


psychopathology_pls_pred_followup_train <- purrr::pmap(list(psychopathology_followup_recipe_list,
                                                         psychopathology_pls_fit_wf_followup,
                                                         psychopathology_followup_train_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 


psychopathology_pls_pred_followup_results_train <- purrr::map(psychopathology_pls_pred_followup_train,"model_predict")
### compute the performance metric
psychopathology_followup_metric <- purrr::map2(.x=psychopathology_pls_pred_followup_results,
     .y=psychopathology_followup_test,~metric_compute_site(data_input =.x ,
                                           site_input = .y)) %>%
                      do.call(rbind,.)

      psychopathology_followup_metric%>% 
    kableExtra::kbl(caption = "metrics for all sites in followup") %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for all sites in followup
correlation tradrsq MAE RMSE site
0.4952109 0.2431941 0.6122293 0.7813798 site01
0.3786713 0.1079337 0.7370283 0.9342754 site02
0.4003672 0.1470680 0.7223435 0.8859100 site03
0.4093904 0.1647772 0.7256146 0.9288682 site04
0.5372687 0.2853286 0.6396871 0.7840006 site05
0.2730276 0.0216919 0.6745511 0.8709119 site06
0.4502930 0.1768879 0.6599253 0.8607248 site07
0.3557054 0.1185730 0.6877080 0.8648319 site08
0.3198527 0.0728102 0.6923593 0.8783524 site09
0.4091455 0.1659586 0.6663097 0.8450050 site10
0.3219722 0.0863595 0.6984157 0.8997283 site11
0.5387924 0.2751461 0.6097954 0.7743932 site12
0.3996873 0.1341746 0.6821421 0.8498319 site13
0.2786360 0.0514307 0.7229119 0.9276650 site14
0.4819168 0.1625940 0.7736475 0.9633101 site15
0.3356247 0.0919820 0.7246159 0.9159794 site16
0.2866875 0.0184466 0.7355750 0.9204807 site17
0.4154152 0.1507645 0.7053084 0.8761722 site18
0.3515408 0.1095605 0.7807932 0.9774203 site19
0.4492067 0.2017771 0.7151134 0.9041720 site20
0.4311515 0.1754779 0.6439819 0.8270305 site21
psychopathology_followup_metric_avg <- average_metric_one_mod(metric_list =psychopathology_followup_metric)

psychopathology_followup_metric_avg_table <- psychopathology_followup_metric_avg %>%
  mutate_if(is.numeric, round, digits=3)%>%
  mutate("correlation (sd)" = paste0(correlation," (",cor_sd,")"))%>%
  mutate("tradrsq (sd)" = paste0(tradrsq," (",rsq_sd,")"))%>%
  mutate("MAE (sd)" = paste0(MAE," (",mae_sd,")"))%>%
  mutate("RMSE (sd)" = paste0(RMSE," (",rmse_sd,")"))%>%
  select_if(is.character)
  
avg_table_var_names <- c("correlation (sd)", "tradrsq (sd)","MAE (sd)","RMSE (sd)"  )

  psychopathology_followup_metric_avg_table%>%
    dplyr::select(all_of(avg_table_var_names))%>%
    kableExtra::kbl(caption = paste0("metrics for modalities averaged across sites in followup")) %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for modalities averaged across sites in followup
correlation (sd) tradrsq (sd) MAE (sd) RMSE (sd)
0.396 (0.079) 0.141 (0.073) 0.696 (0.047) 0.88 (0.056)

4.2 Plotting the trace of performance metric against the number of factors.

Baseline

## get the model grid
psychopathology_pls_grid_baseline  <- purrr::map(psychopathology_pls_fit_baseline,"pls_grid")
psychopathology_pls_param_baseline  <- purrr::map(psychopathology_pls_fit_baseline,"best_pls_model")

factor_metric_plot <- function(grid_input, param_input){
  selected_comp <- param_input$num_comp
  
  comp_plot <-  grid_input %>% 
  collect_metrics() %>% 
  ggplot(aes(num_comp, mean, col = .metric)) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = selected_comp, size=1.5)+
  scale_x_continuous(n.breaks = 26) +
  labs(x = "Number of components",
       y = "Indicator",
       title = "Plot of RMSE vs number of components ",
       subtitle = paste0("Optimal number of components is ", selected_comp)) +
 facet_grid(.metric ~.) +
  theme_few() +
  theme(legend.position = "none")

  return(comp_plot)
}


comp_metric_plot_baseline <- purrr::map2(.x = psychopathology_pls_grid_baseline,
                                         .y = psychopathology_pls_param_baseline,
                                         ~factor_metric_plot(grid_input= .x ,
                                                             param_input = .y))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
comp_metric_plot_baseline
## $site01

## 
## $site02

## 
## $site03

## 
## $site04

## 
## $site05

## 
## $site06

## 
## $site07

## 
## $site08

## 
## $site09

## 
## $site10

## 
## $site11

## 
## $site12

## 
## $site13

## 
## $site14

## 
## $site15

## 
## $site16

## 
## $site17

## 
## $site18

## 
## $site19

## 
## $site20

## 
## $site21

Followup

## get the model grid
psychopathology_pls_grid_followup  <- purrr::map(psychopathology_pls_fit_followup,"pls_grid")
psychopathology_pls_param_followup  <- purrr::map(psychopathology_pls_fit_followup,"best_pls_model")

comp_metric_plot_followup <- purrr::map2(.x = psychopathology_pls_grid_followup,
                                         .y = psychopathology_pls_param_followup,
                                         ~factor_metric_plot(grid_input= .x ,
                                                             param_input = .y))

comp_metric_plot_followup
## $site01

## 
## $site02

## 
## $site03

## 
## $site04

## 
## $site05

## 
## $site06

## 
## $site07

## 
## $site08

## 
## $site09

## 
## $site10

## 
## $site11

## 
## $site12

## 
## $site13

## 
## $site14

## 
## $site15

## 
## $site16

## 
## $site17

## 
## $site18

## 
## $site19

## 
## $site20

## 
## $site21

5 Feature importance for the whole data set

##baseline

Model across all sites. Combine the baseline train and baseline test of the first element of the list. It will cover all of the data.

5.0.1 Baseline component bar plot

data_all_site_baseline <- rbind(psychopathology_baseline_train_select[[1]],
                                psychopathology_baseline_test_select[[1]])

### data are scaled in recipe
all_data_recipe_baseline <- recipe_prep_scale(train_input=data_all_site_baseline)

## follow the function use a more parsimonious model 
  # Now find the least complex model that has no more than a 0.1% loss of RMSE:
all_data_fit_baseline <- pls_tune(recipe_input = all_data_recipe_baseline)

all_data_wf_baseline <- all_data_fit_baseline[["pls_final_wf"]]

all_data_final_fit_baseline <- all_data_wf_baseline%>%
    parsnip::extract_spec_parsnip()%>%
    parsnip::fit(data = data_all_site_baseline, formula= as.formula("gfactor~."))

tidy_all_data_final_fit_baseline <- all_data_final_fit_baseline%>% 
  tidy()

all_data_param_baseline <-all_data_fit_baseline[["best_pls_model"]][["num_comp"]]
### extract the variance explained by each component(read the paper)

var_explained <- all_data_final_fit_baseline[["fit"]][["prop_expl_var"]][["X"]]

### plotting feature importance based on the model with all the data

comp_idx_vec <- c(1:all_data_param_baseline)

tidy_all_data_final_fit_baseline <- tidy_all_data_final_fit_baseline %>%
                                    rename(feature_names = term)
tidy_all_data_final_fit_baseline_with_name<- full_join(tidy_all_data_final_fit_baseline,
                                                       plotting_names, by = "feature_names")%>%
                                                filter(feature_names != "Y", # outcome variable col name
                                                       )
tidy_all_data_final_fit_baseline_list <- purrr::map(comp_idx_vec,
                                                    ~filter(tidy_all_data_final_fit_baseline_with_name,
                                                                         component == .)%>%
                                                          dplyr::select(all_of(c("plotting_name","value"))))

### get the variable order from the first component:

tidy_all_data_final_fit_baseline_reordered <- tidy_all_data_final_fit_baseline_list[[1]] %>% 
                                                                  arrange(value)

tidy_all_data_final_fit_baseline_reordered<- tidy_all_data_final_fit_baseline_reordered$plotting_name

pls_vi_plot_with_label <- function(data_input=tidy_all_data_final_fit_baseline_list[[1]],
                                   var_input = var_explained[1],
                                   idx_input = comp_idx_vec[1],
                                   reorder_name = tidy_all_data_final_fit_baseline_reordered){
  ### arrange the data with the character vector input
  data_input <- data_input %>%
                mutate(plotting_name = as.factor(plotting_name))%>%
                  mutate(plotting_name = factor(plotting_name,
                                                levels =reorder_name))
  ### setting up the plot titles
    range_value <- range(data_input$value)
  var_title_long <- paste0("component ", idx_input," var explained ",round(var_input,3)*100,"%")
 var_title_short <- paste0(round(var_input,3)*100,"%")
  var_title_medium <- paste0("comp ", idx_input," \n ",round(var_input,3)*100,"%")

  bar_plot <- ggplot(data_input, aes(x=.data[["value"]], y=plotting_name)) +
  geom_bar(stat="identity")+
  theme_classic() + 
    scale_x_continuous(limits = c(round(range_value[1],2)-0.05, round(range_value[2],2)+0.05),
                       breaks = c(round(range_value[1],2)-0.05,0, round(range_value[2],2)+0.05))+
    labs(title = var_title_medium)+
theme(
  axis.title.x = element_blank(),
  axis.text.x = element_text(size = 12,angle = 60,vjust = 0.5),
  axis.title.y = element_blank(),
  axis.text.y = element_text(size = 12),
  legend.text = element_blank(),
  plot.title = element_text(size=15))
  return(bar_plot)
}

## get the component one plot
comp_one_plot <- pls_vi_plot_with_label(data_input=tidy_all_data_final_fit_baseline_list[[1]],
                                   var_input = var_explained[1],
                                   idx_input = comp_idx_vec[1])



pls_vi_plot_no_label <- function(data_input=tidy_all_data_final_fit_baseline_list[[2]],
                                   var_input = var_explained[2],
                                   idx_input = comp_idx_vec[2],
                                 reorder_name = tidy_all_data_final_fit_baseline_reordered){
  ### arrange the data with the character vector input
  data_input <- data_input %>%
                mutate(plotting_name = as.factor(plotting_name))%>%
                  mutate(plotting_name = factor(plotting_name,
                                                levels =reorder_name))
    ### setting up the plot titles
  range_value <- range(data_input$value)
  
  var_title_long <- paste0("component ", idx_input," var explained ",round(var_input,3)*100,"%")
  var_title_short <- paste0(round(var_input,3)*100,"%")
    var_title_medium <- paste0("comp ", idx_input," \n ",round(var_input,3)*100,"%")

  bar_plot <- ggplot(data_input, aes(x=.data[["value"]], y=plotting_name)) +
  geom_bar(stat="identity")+
    scale_x_continuous(limits = c(round(range_value[1],2)-0.05, round(range_value[2],2)+0.05),
                       breaks = c(round(range_value[1],2)-0.05,0, round(range_value[2],2)+0.05))+
  theme_classic() + 
    labs(title = var_title_medium)+
theme(
  axis.title.x = element_blank(),
  axis.text.x = element_text(size = 12,angle = 60,vjust = 0.5),
  axis.title.y = element_blank(),
  axis.text.y = element_blank(),
  legend.text = element_blank(),
  plot.title = element_text(size=15),
  axis.ticks = element_blank())
  return(bar_plot)
}


comp_other_plots <- purrr::pmap(list(tidy_all_data_final_fit_baseline_list[2:all_data_param_baseline],
                                     var_explained[2:all_data_param_baseline],
                                     comp_idx_vec[2:all_data_param_baseline
                                                  ]),~pls_vi_plot_no_label(data_input=..1,
                                   var_input = ..2,
                                   idx_input = ..3)) 
comp_other_plots_combined <- ggpubr::ggarrange(plotlist=comp_other_plots, nrow =1,ncol = length(var_explained)-1)
## combine the plots together with carefully specified ratios
comp_plots_all <- gridExtra::grid.arrange(comp_one_plot,comp_other_plots_combined,nrow = 1, ncol = 2, widths = c(3.3, 4))

comp_plots_all
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]

5.0.2 Compute correlation between features and gfactor for baseline

5.0.2.1 Computing correlation plots across sites and plotting the bar plot for baseline

The bar plot for correlations across all sites. Train and test fold are joined together.

corr_data_all <- rbind(psychopathology_baseline_train[[1]],psychopathology_baseline_test[[1]])

 corr_all_features <- purrr::map(.x = features,~cor(corr_data_all[[.x]],corr_data_all[["gfactor"]]))%>% 
                                          do.call(rbind,.)%>% as.numeric()
 
 corr_all_features_cor_test <- purrr::map(.x = features,
                            ~cor.test(corr_data_all[[.x]],corr_data_all[["gfactor"]],method="pearson"))
 
 corr_all_features_ci <- purrr::map(corr_all_features_cor_test,"conf.int")%>% 
                                          do.call(rbind,.)%>% tibble::as_tibble()%>%
                                          rename(low=V1,upp=V2)
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
  corr_output_tibble <- tibble(feature_names = features,value = corr_all_features)
corr_output_tibble <- cbind(corr_output_tibble,corr_all_features_ci)
  
  corr_baseline_all_sites_names <- full_join(corr_output_tibble,plotting_names, by = "feature_names")
  
  
   corr_baseline_all_sites_names%>%
 mutate(plotting_name = fct_reorder(plotting_name, value,.fun = "max"))%>%
  ggplot(aes(x = plotting_name, y = value))+
    geom_bar(stat = "identity",fill="gray30",alpha = 0.7)+
    geom_errorbar( aes(x=plotting_name, 
                   ymin=low, 
                   ymax=upp),
               width=0.4, colour="black", alpha=0.9, linewidth=1.3)+
      coord_flip()+
    theme_classic() + 
  labs(y =paste0( "Correlation ") , x = "") +
    theme(axis.title.x= element_text(size = 20),
          axis.title.y= element_text(size = 20),
          axis.text.y = element_text(size = 15),
          axis.text.x = element_text(size = 20))

The plot without y-axis labels. Used in the aligned plot.

## use the order of baseline feature importance to order the correlation plot
## The order is the same with the magnitude of the univariate correlations
corr_baseline_all_sites_names_for_all <-corr_baseline_all_sites_names%>%
     mutate(plotting_name = as.factor(plotting_name))%>%
                  mutate(plotting_name = factor(plotting_name,
                                                levels =tidy_all_data_final_fit_baseline_reordered))
  
  
corr_bar_plot_baseline <-   corr_baseline_all_sites_names_for_all%>%
 #mutate(plotting_name = fct_reorder(plotting_name, value,.fun = "max"))%>%
  ggplot(aes(x = plotting_name, y = value))+
    geom_bar(stat = "identity",fill="gray40",alpha = 0.7)+
    geom_errorbar( aes(x=plotting_name, 
                   ymin=low, 
                   ymax=upp),
               width=0.4, colour="black", alpha=0.9, linewidth=1.3)+
  scale_y_continuous(limits = c(-0.25, 0.25),
                       breaks = c(-0.25,0, 0.25))+
      coord_flip()+
    theme_classic() + 
    labs(title = "Univariate \ncorrelations")+
theme(
  axis.title.x = element_blank(),
  axis.text.x = element_text(size = 12,angle = 60,vjust = 0.5),
  axis.title.y = element_blank(),
  axis.text.y = element_blank(),
  legend.text = element_blank(),
  plot.title = element_text(size=15),
  axis.ticks = element_blank())

5.0.3 join correlation plot with univariate together

vi_pls_plot_baseline_all <-ggpubr::ggarrange(comp_plots_all,corr_bar_plot_baseline,nrow = 1, ncol = 2, widths = c(8, 1.4)) 
vi_pls_plot_baseline_all

5.1 Followup

Use the same codes as baseline.

data_all_site_followup <- rbind(psychopathology_followup_train_select[[1]],
                                psychopathology_followup_test_select[[1]])

all_data_recipe_followup <- recipe_prep_scale(train_input=data_all_site_followup)

## follow the function use a more parsimonious model 
## cut at the number of component that does not reduce 0.1% of the RMSE
all_data_fit_followup <- pls_tune(recipe_input = all_data_recipe_followup)

all_data_wf_followup <- all_data_fit_followup[["pls_final_wf"]]

all_data_final_fit_followup <- all_data_wf_followup%>%
    parsnip::extract_spec_parsnip()%>%
    parsnip::fit(data = data_all_site_followup, formula= as.formula("gfactor~."))

tidy_all_data_final_fit_followup <- all_data_final_fit_followup%>% 
  tidy()

all_data_param_followup <-all_data_fit_followup[["best_pls_model"]][["num_comp"]]
### extract the variance explained by each component

var_explained_followup <- all_data_final_fit_followup[["fit"]][["prop_expl_var"]][["X"]]

### plotting feature importance based on the model with all the data

comp_idx_vec_followup <- c(1:all_data_param_followup)

tidy_all_data_final_fit_followup <- tidy_all_data_final_fit_followup %>%
                                    rename(feature_names = term)
tidy_all_data_final_fit_followup_with_name<- full_join(tidy_all_data_final_fit_followup,
                                                       plotting_names, by = "feature_names")%>%
                                                filter(feature_names != "Y", # outcome variable col name
                                                       )
tidy_all_data_final_fit_followup_list <- purrr::map(comp_idx_vec_followup,
                                                    ~filter(tidy_all_data_final_fit_followup_with_name,
                                                                         component == .)%>%
                                                          dplyr::select(all_of(c("plotting_name","value"))))

### get the variable order from the first component:

tidy_all_data_final_fit_followup_reordered <- tidy_all_data_final_fit_followup_list[[1]] %>% 
                                                                  arrange(value)

tidy_all_data_final_fit_followup_reordered<- tidy_all_data_final_fit_followup_reordered$plotting_name


comp_one_plot_followup <- pls_vi_plot_with_label(data_input=tidy_all_data_final_fit_followup_list[[1]],
                                   var_input = var_explained_followup[1],
                                   idx_input = comp_idx_vec_followup[1],
                                   reorder_name=tidy_all_data_final_fit_followup_reordered)


comp_other_plots_followup <- purrr::pmap(list(tidy_all_data_final_fit_followup_list[2:all_data_param_followup],
                                     var_explained_followup[2:all_data_param_followup],
                                     comp_idx_vec_followup[2:all_data_param_followup]
                                     ),~pls_vi_plot_no_label(data_input=..1,
                                   var_input = ..2,
                                   idx_input = ..3,
                                   reorder_name=tidy_all_data_final_fit_followup_reordered)) 
comp_other_plots_combined_followup <- ggpubr::ggarrange(plotlist=comp_other_plots_followup,
                                                        nrow =1,ncol = length(var_explained_followup)-1)

comp_plots_all_followup <- gridExtra::grid.arrange(comp_one_plot_followup,
                                                   comp_other_plots_combined_followup,
                                                   nrow = 1, ncol = 2, widths = c(3.3, 4))

comp_plots_all_followup
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]

5.1.1 plotting univariate correlation plots across sites at followup

The bar plot for correlations across all sites. Train and test fold are joined together.

corr_data_all_followup <- rbind(psychopathology_followup_train[[1]],psychopathology_followup_test[[1]])

 corr_all_features_followup <- purrr::map(.x = features,~cor(corr_data_all_followup[[.x]],
                                                             corr_data_all_followup[["gfactor"]]))%>% 
                                          do.call(rbind,.)%>% as.numeric()
 
 corr_all_features_cor_test_followup <- purrr::map(.x = features,
                            ~cor.test(corr_data_all_followup[[.x]],
                                      corr_data_all_followup[["gfactor"]],method="pearson"))
 
 corr_all_features_ci_followup <- purrr::map(corr_all_features_cor_test_followup,"conf.int")%>% 
                                          do.call(rbind,.)%>% tibble::as_tibble()%>%
                                          rename(low=V1,upp=V2)
                                          
   
 
corr_output_tibble_followup <- tibble(feature_names = features,value = corr_all_features_followup)
corr_output_tibble_followup <- cbind(corr_output_tibble_followup,corr_all_features_ci_followup)
  
  corr_followup_all_sites_names <- full_join(corr_output_tibble_followup,
                                                      plotting_names, by = "feature_names")
  
  
   corr_followup_all_sites_names%>%
 mutate(plotting_name = fct_reorder(plotting_name, value,.fun = "max"))%>%
  ggplot(aes(x = plotting_name, y = value))+
    geom_bar(stat = "identity",fill="gray30",alpha = 0.7)+
    geom_errorbar( aes(x=plotting_name, 
                   ymin=low, 
                   ymax=upp),
               width=0.4, colour="black", alpha=0.9, linewidth=1.3)+
      coord_flip()+
    theme_classic() + 
  labs(y =paste0( "Correlation ") , x = "") +
    theme(axis.title.x= element_text(size = 20),
          axis.title.y= element_text(size = 20),
          axis.text.y = element_text(size = 15),
          axis.text.x = element_text(size = 20))

corr_followup_all_sites_names_for_all <-corr_followup_all_sites_names%>%
     mutate(plotting_name = as.factor(plotting_name))%>%
                  mutate(plotting_name = factor(plotting_name,
                                                levels =tidy_all_data_final_fit_followup_reordered))
  
  
corr_bar_plot_followup <-   corr_followup_all_sites_names_for_all%>%
 #mutate(plotting_name = fct_reorder(plotting_name, value,.fun = "max"))%>%
  ggplot(aes(x = plotting_name, y = value))+
    geom_bar(stat = "identity",fill="gray40",alpha = 0.7)+
    geom_errorbar( aes(x=plotting_name, 
                   ymin=low, 
                   ymax=upp),
               width=0.4, colour="black", alpha=0.9, linewidth=1.3)+
  scale_y_continuous(limits = c(-0.25, 0.25),
         breaks = c(-0.25,0, 0.25))+##use this because all the plots should have the same height
      coord_flip()+
    theme_classic() + 
    labs(title = "Univariate \ncorrelations")+
theme(
  axis.title.x = element_blank(),
  axis.text.x = element_text(size = 12,angle = 60,vjust = 0.5),
  axis.title.y = element_blank(),
  axis.text.y = element_blank(),
  legend.text = element_blank(),
  plot.title = element_text(size=15),
  axis.ticks = element_blank())

5.2 join correlation plot with univariate together

vi_pls_plot_followup_all <-ggpubr::ggarrange(comp_plots_all_followup,
                                                   corr_bar_plot_followup,
                                                   nrow = 1, ncol = 2, widths = c(8, 1.4)) 
vi_pls_plot_followup_all

Process the feature importance plots: align the baseline and followup plot together.

vi_pls_plot_baseline_label <- vi_pls_plot_baseline_all %>%
                            ggpubr::annotate_figure(top = ggpubr::text_grob("Baseline",size=20,hjust=2.5))


vi_pls_plot_followup_label <- vi_pls_plot_followup_all %>%
                            ggpubr::annotate_figure(top = ggpubr::text_grob("Followup",size=20,hjust=2.5))



vi_pls_plot_label <- ggpubr::ggarrange(vi_pls_plot_baseline_label,vi_pls_plot_followup_label,nrow = 2)

title_vi_pls_plot <- ggpubr::annotate_figure(vi_pls_plot_label,
                        top = ggpubr::text_grob("Feature importance of Partial Least Squares Regressions \nPredicting Cognitive Abilities from Mental Health Variables",size=25, face = "bold")) 

title_vi_pls_plot

5.3 save the output

output_list <- list(baseline_train_pred = psychopathology_pls_pred_baseline_results_train,
                    baseline_test_pred = psychopathology_pls_pred_baseline_results,
                    baseline_train_data = psychopathology_baseline_train,
                    baseline_test_data = psychopathology_baseline_test,
                    followup_train_pred = psychopathology_pls_pred_followup_results_train,
                    followup_test_pred = psychopathology_pls_pred_followup_results,
                    followup_train_data =psychopathology_followup_train ,
                    followup_test_data = psychopathology_followup_test)
saveRDS(output_list,paste0(scriptfold,'Common_psy_gene_brain_all/saved_outputs/psychopathology_pls_pred_2.0', '.RData'))

6 The performance of all three modalities individually: ASR, CBCL and personality

6.1 Parent Mental Health (ASR)

6.1.1 extract the data.

parents_baseline_train_select <- purrr::map(.x = psychopathology_baseline_train_select,
                                     ~dplyr::select(.,all_of(c(Parent_Mental_Health_names,"gfactor"))))
parents_baseline_test_select <- purrr::map(.x = psychopathology_baseline_test_select,
                                     ~dplyr::select(.,all_of(c(Parent_Mental_Health_names,"gfactor")))) 
parents_followup_train_select <- purrr::map(.x = psychopathology_followup_train_select,
                                     ~dplyr::select(.,all_of(c(Parent_Mental_Health_names,"gfactor"))))
parents_followup_test_select <- purrr::map(.x = psychopathology_followup_test_select,
                                     ~dplyr::select(.,all_of(c(Parent_Mental_Health_names,"gfactor")))) 

6.1.2 Model fitting

### fit the enet model
### baseline
parents_baseline_recipe_list <- purrr::map(.x = parents_baseline_train_select,
                             ~recipe_prep(train_input=.x,features_input = Parent_Mental_Health_names)) 

parents_pls_fit_baseline <- parents_baseline_recipe_list %>% 
                          purrr::map(.,~pls_tune(recipe_input = .,feature_input = Parent_Mental_Health_names)) 

parents_pls_fit_wf <- purrr::map(parents_pls_fit_baseline,"pls_final_wf")


parents_pls_pred_baseline <- purrr::pmap(list(parents_baseline_recipe_list,
                                        parents_pls_fit_wf,
                                        parents_baseline_test_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 





parents_pls_pred_baseline_results <- purrr::map(parents_pls_pred_baseline,"model_predict")


parents_pls_pred_baseline_train <- purrr::pmap(list(parents_baseline_recipe_list,
                                                         parents_pls_fit_wf,
                                                         parents_baseline_train_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 

parents_pls_pred_baseline_results_train <- purrr::map(parents_pls_pred_baseline_train,"model_predict")

parents_baseline_metric <- purrr::map2(.x=parents_pls_pred_baseline_results,
     .y=psychopathology_baseline_test,
     ~metric_compute_site(data_input =.x ,
                    site_input = .y)) %>%
                      do.call(rbind,.)

      parents_baseline_metric%>% 
    kableExtra::kbl(caption = "metrics for all sites in baseline") %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for all sites in baseline
correlation tradrsq MAE RMSE site
0.1774546 0.0253694 0.7794813 0.9783738 site01
0.1605865 0.0159658 0.7810571 0.9918116 site02
0.1342591 0.0046575 0.7860262 0.9968340 site03
0.1850618 0.0300186 0.7793983 0.9842061 site04
0.3572455 0.1170182 0.7358356 0.9383759 site05
0.2086662 0.0418865 0.7701106 0.9785722 site06
0.3509817 0.1143150 0.7326500 0.9396550 site07
0.1497906 0.0145928 0.7709818 0.9912116 site08
0.2295732 0.0526340 0.7773551 0.9720882 site09
0.2113015 0.0439401 0.7643733 0.9758199 site10
0.2590552 0.0670191 0.7535869 0.9659038 site11
0.4167393 0.1454727 0.7266680 0.9244031 site12
0.2205384 0.0468376 0.7632017 0.9756164 site13
0.0938130 -0.0183780 0.8103373 1.0083002 site14
0.1932776 0.0363187 0.7962477 0.9798977 site15
0.1110058 -0.0098377 0.7819172 1.0043736 site16
0.1374260 0.0018720 0.7806215 0.9981549 site17
0.1077463 -0.0099094 0.8010771 1.0036088 site18
0.2134093 0.0417061 0.7770648 0.9785018 site19
0.2144236 0.0435759 0.7629036 0.9772489 site20
0.3598255 0.1188617 0.7404714 0.9371271 site21
parents_baseline_metric_avg <- average_metric_one_mod(metric_list =parents_baseline_metric)

parents_baseline_metric_avg_table <- parents_baseline_metric_avg %>%
  mutate_if(is.numeric, round, digits=3)%>%
  mutate("correlation (sd)" = paste0(correlation," (",cor_sd,")"))%>%
  mutate("tradrsq (sd)" = paste0(tradrsq," (",rsq_sd,")"))%>%
  mutate("MAE (sd)" = paste0(MAE," (",mae_sd,")"))%>%
  mutate("RMSE (sd)" = paste0(RMSE," (",rmse_sd,")"))%>%
  select_if(is.character)
  
  parents_baseline_metric_avg_table%>%
    dplyr::select(all_of(avg_table_var_names))%>%
    kableExtra::kbl(caption = paste0("metrics for modalities averaged across sites in baseline")) %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for modalities averaged across sites in baseline
correlation (sd) tradrsq (sd) MAE (sd) RMSE (sd)
0.214 (0.09) 0.044 (0.046) 0.77 (0.022) 0.976 (0.024)
### fit the enet model
### followup
parents_followup_recipe_list <- purrr::map(.x=parents_followup_train_select,
                                    ~recipe_prep(train_input=.x,
                                                 features_input = Parent_Mental_Health_names)) 

parents_pls_fit_followup <- parents_followup_recipe_list %>%
                           purrr::map(.,~pls_tune(recipe_input = .,feature_input = Parent_Mental_Health_names))

parents_pls_fit_wf_followup <- purrr::map(parents_pls_fit_followup,"pls_final_wf")


parents_pls_pred_followup <- purrr::pmap(list(parents_followup_recipe_list,
                                                         parents_pls_fit_wf_followup,
                                                         parents_followup_test_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 

parents_pls_pred_followup_results <- purrr::map(parents_pls_pred_followup,"model_predict")


parents_pls_pred_followup_train <- purrr::pmap(list(parents_followup_recipe_list,
                                                         parents_pls_fit_wf_followup,
                                                         parents_followup_train_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 

parents_pls_pred_followup_results_train <- purrr::map(parents_pls_pred_followup_train,"model_predict")


parents_followup_metric <- purrr::map2(.x=parents_pls_pred_followup_results,
     .y=psychopathology_followup_test,~metric_compute_site(data_input =.x ,
                                           site_input = .y)) %>%
                      do.call(rbind,.)

      parents_followup_metric%>% 
    kableExtra::kbl(caption = "metrics for all sites in followup") %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for all sites in followup
correlation tradrsq MAE RMSE site
0.3141114 0.0985489 0.6736215 0.8527879 site01
0.2153789 -0.0047732 0.7839466 0.9915403 site02
0.1962820 0.0321684 0.7476408 0.9436964 site03
0.1833391 0.0243764 0.7905057 1.0039083 site04
0.3495083 0.1115581 0.7121534 0.8741334 site05
0.1685226 0.0214813 0.6717714 0.8710057 site06
0.3597345 0.0829015 0.7187615 0.9085374 site07
0.1383235 -0.0080121 0.7275678 0.9248502 site08
0.1717375 0.0105191 0.7226130 0.9073779 site09
0.2253441 0.0497414 0.7156415 0.9019582 site10
0.1835246 0.0178754 0.7278881 0.9328397 site11
0.3436594 0.1060749 0.6812224 0.8599770 site12
0.1190815 -0.0287100 0.7327159 0.9263271 site13
0.0998072 -0.0230309 0.7556889 0.9633875 site14
0.3007732 0.0269021 0.8347918 1.0384278 site15
0.1267760 -0.0187616 0.7548126 0.9702302 site16
0.1355823 -0.0420926 0.7408100 0.9484422 site17
0.0752281 -0.0583869 0.7657335 0.9781323 site18
0.1899620 0.0229174 0.8235628 1.0238699 site19
0.2462298 0.0589246 0.7773010 0.9817506 site20
0.2910651 0.0802868 0.6916655 0.8734671 site21
parents_followup_metric_avg <- average_metric_one_mod(metric_list =parents_followup_metric)

parents_followup_metric_avg_table <- parents_followup_metric_avg %>%
  mutate_if(is.numeric, round, digits=3)%>%
  mutate("correlation (sd)" = paste0(correlation," (",cor_sd,")"))%>%
  mutate("tradrsq (sd)" = paste0(tradrsq," (",rsq_sd,")"))%>%
  mutate("MAE (sd)" = paste0(MAE," (",mae_sd,")"))%>%
  mutate("RMSE (sd)" = paste0(RMSE," (",rmse_sd,")"))%>%
  select_if(is.character)
  
  parents_followup_metric_avg_table%>%
    dplyr::select(all_of(avg_table_var_names))%>%
    kableExtra::kbl(caption = paste0("metrics for modalities averaged across sites in followup")) %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for modalities averaged across sites in followup
correlation (sd) tradrsq (sd) MAE (sd) RMSE (sd)
0.211 (0.086) 0.027 (0.049) 0.74 (0.045) 0.937 (0.055)

6.2 CBCL sum scores

6.2.1 extract the data

CBCL_baseline_train_select <- purrr::map(.x = psychopathology_baseline_train_select,
                                     ~dplyr::select(.,all_of(c(CBCL_sum_scores_names,"gfactor"))))
CBCL_baseline_test_select <- purrr::map(.x = psychopathology_baseline_test_select,
                                     ~dplyr::select(.,all_of(c(CBCL_sum_scores_names,"gfactor")))) 
CBCL_followup_train_select <- purrr::map(.x = psychopathology_followup_train_select,
                                     ~dplyr::select(.,all_of(c(CBCL_sum_scores_names,"gfactor"))))
CBCL_followup_test_select <- purrr::map(.x = psychopathology_followup_test_select,
                                     ~dplyr::select(.,all_of(c(CBCL_sum_scores_names,"gfactor")))) 

6.2.2 model fitting

### fit the enet model
### baseline
CBCL_baseline_recipe_list <- purrr::map(.x = CBCL_baseline_train_select,
                             ~recipe_prep(train_input=.x,features_input = CBCL_sum_scores_names)) 

CBCL_pls_fit_baseline <- CBCL_baseline_recipe_list%>%
                purrr::map(.,~pls_tune(recipe_input = .,feature_input = CBCL_sum_scores_names))

CBCL_pls_fit_wf <-  purrr::map(CBCL_pls_fit_baseline,"pls_final_wf")


CBCL_pls_pred_baseline <- purrr::pmap(list(CBCL_baseline_recipe_list,
                                        CBCL_pls_fit_wf,
                                        CBCL_baseline_test_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 





CBCL_pls_pred_baseline_results <- purrr::map(CBCL_pls_pred_baseline,"model_predict")


CBCL_pls_pred_baseline_train <- purrr::pmap(list(CBCL_baseline_recipe_list,
                                                         CBCL_pls_fit_wf,
                                                         CBCL_baseline_train_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 

CBCL_pls_pred_baseline_results_train <- purrr::map(CBCL_pls_pred_baseline_train,"model_predict")

CBCL_baseline_metric <- purrr::map2(.x=CBCL_pls_pred_baseline_results,
     .y=psychopathology_baseline_test,~metric_compute_site(data_input =.x ,
                                           site_input = .y)) %>%
                      do.call(rbind,.)

      CBCL_baseline_metric%>% 
    kableExtra::kbl(caption = "metrics for all sites in baseline") %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for all sites in baseline
correlation tradrsq MAE RMSE site
0.2852654 0.0806783 0.7614239 0.9502077 site01
0.1638130 0.0118078 0.7718336 0.9939048 site02
0.3193542 0.1007515 0.7519156 0.9474939 site03
0.2646696 0.0698236 0.7538343 0.9638002 site04
0.3042560 0.0924329 0.7340396 0.9513501 site05
0.2205012 0.0391954 0.7677496 0.9799455 site06
0.2934250 0.0857120 0.7520055 0.9547073 site07
0.3128640 0.0971566 0.7540734 0.9487783 site08
0.2964436 0.0876855 0.7597014 0.9539356 site09
0.2962309 0.0877278 0.7490031 0.9532117 site10
0.2814532 0.0791531 0.7558358 0.9596022 site11
0.2753091 0.0748223 0.7548884 0.9618581 site12
0.3503260 0.1182709 0.7390113 0.9383465 site13
0.1560931 0.0054219 0.7863803 0.9964483 site14
0.2793443 0.0780096 0.7839301 0.9584671 site15
0.2961880 0.0877240 0.7375309 0.9546246 site16
0.2985057 0.0891046 0.7494149 0.9535403 site17
0.2807598 0.0788090 0.7666843 0.9585132 site18
0.2677343 0.0701404 0.7714832 0.9638755 site19
0.2638427 0.0675426 0.7496668 0.9649270 site20
0.2162432 0.0442271 0.7650651 0.9760089 site21
CBCL_baseline_metric_avg <- average_metric_one_mod(metric_list =CBCL_baseline_metric)

CBCL_baseline_metric_avg_table <- CBCL_baseline_metric_avg %>%
  mutate_if(is.numeric, round, digits=3)%>%
  mutate("correlation (sd)" = paste0(correlation," (",cor_sd,")"))%>%
  mutate("tradrsq (sd)" = paste0(tradrsq," (",rsq_sd,")"))%>%
  mutate("MAE (sd)" = paste0(MAE," (",mae_sd,")"))%>%
  mutate("RMSE (sd)" = paste0(RMSE," (",rmse_sd,")"))%>%
  select_if(is.character)
  
  CBCL_baseline_metric_avg_table%>%
    dplyr::select(all_of(avg_table_var_names))%>%
    kableExtra::kbl(caption = paste0("metrics for modalities averaged across sites in baseline")) %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for modalities averaged across sites in baseline
correlation (sd) tradrsq (sd) MAE (sd) RMSE (sd)
0.273 (0.048) 0.074 (0.028) 0.758 (0.014) 0.961 (0.015)
### fit the enet model
### followup
CBCL_followup_recipe_list <- purrr::map(.x=CBCL_followup_train_select,
                                        ~recipe_prep(train_input=.x,features_input = CBCL_sum_scores_names)) 

CBCL_pls_fit_followup <- CBCL_followup_recipe_list %>%
                purrr::map(.,~pls_tune(recipe_input = .,feature_input = CBCL_sum_scores_names))

CBCL_pls_fit_wf_followup <- purrr::map(CBCL_pls_fit_followup,"pls_final_wf")


CBCL_pls_pred_followup <- purrr::pmap(list(CBCL_followup_recipe_list,
                                                         CBCL_pls_fit_wf_followup,
                                                         CBCL_followup_test_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 

CBCL_pls_pred_followup_results <- purrr::map(CBCL_pls_pred_followup,"model_predict")


CBCL_pls_pred_followup_train <- purrr::pmap(list(CBCL_followup_recipe_list,
                                                         CBCL_pls_fit_wf_followup,
                                                         CBCL_followup_train_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 

CBCL_pls_pred_followup_results_train <-purrr::map(CBCL_pls_pred_followup_train,"model_predict")


CBCL_followup_metric <- purrr::map2(.x=CBCL_pls_pred_followup_results,
     .y=psychopathology_followup_test,~metric_compute_site(data_input =.x ,
                                           site_input = .y)) %>%
                      do.call(rbind,.)

      CBCL_followup_metric%>% 
    kableExtra::kbl(caption = "metrics for all sites in followup") %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for all sites in followup
correlation tradrsq MAE RMSE site
0.2233917 0.0431689 0.7033007 0.8785927 site01
0.2118718 -0.0088482 0.7889409 0.9935490 site02
0.2779015 0.0708403 0.7435611 0.9246504 site03
0.2043701 0.0337593 0.7818379 0.9990692 site04
0.3408640 0.1131536 0.7123041 0.8733482 site05
0.1702036 0.0008550 0.6846625 0.8801378 site06
0.2353501 0.0223710 0.7066789 0.9380410 site07
0.3322801 0.0985400 0.6893517 0.8746046 site08
0.1707499 0.0182320 0.7281113 0.9038345 site09
0.2121533 0.0390521 0.7139485 0.9070170 site10
0.3158694 0.0789481 0.7036494 0.9033702 site11
0.2802729 0.0782401 0.6911336 0.8732632 site12
0.2748151 0.0589357 0.7091351 0.8859873 site13
0.1695873 0.0137414 0.7494300 0.9459149 site14
0.2791967 -0.0022583 0.8464142 1.0538720 site15
0.2262949 0.0361840 0.7450453 0.9437035 site16
0.2460411 0.0128788 0.7226655 0.9230877 site17
0.2726952 0.0551548 0.7398666 0.9241782 site18
0.1316393 0.0043793 0.8389213 1.0335372 site19
0.2577099 0.0654529 0.7691142 0.9783394 site20
0.2315294 0.0518044 0.6934370 0.8868891 site21
CBCL_followup_metric_avg <- average_metric_one_mod(metric_list =CBCL_followup_metric)

CBCL_followup_metric_avg_table <- CBCL_followup_metric_avg %>%
  mutate_if(is.numeric, round, digits=3)%>%
  mutate("correlation (sd)" = paste0(correlation," (",cor_sd,")"))%>%
  mutate("tradrsq (sd)" = paste0(tradrsq," (",rsq_sd,")"))%>%
  mutate("MAE (sd)" = paste0(MAE," (",mae_sd,")"))%>%
  mutate("RMSE (sd)" = paste0(RMSE," (",rmse_sd,")"))%>%
  select_if(is.character)
  
  CBCL_followup_metric_avg_table%>%
    dplyr::select(all_of(avg_table_var_names))%>%
    kableExtra::kbl(caption = paste0("metrics for modalities averaged across sites in followup")) %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for modalities averaged across sites in followup
correlation (sd) tradrsq (sd) MAE (sd) RMSE (sd)
0.241 (0.055) 0.042 (0.034) 0.736 (0.046) 0.93 (0.054)

6.3 Child Personality(9):

6.3.1 extract the data

child_baseline_train_select <- purrr::map(.x = psychopathology_baseline_train_select,
                                     ~dplyr::select(.,all_of(c( Child_Personality_names,"gfactor"))))
child_baseline_test_select <- purrr::map(.x = psychopathology_baseline_test_select,
                                     ~dplyr::select(.,all_of(c( Child_Personality_names,"gfactor")))) 
child_followup_train_select <- purrr::map(.x = psychopathology_followup_train_select,
                                     ~dplyr::select(.,all_of(c( Child_Personality_names,"gfactor"))))
child_followup_test_select <- purrr::map(.x = psychopathology_followup_test_select,
                                     ~dplyr::select(.,all_of(c( Child_Personality_names,"gfactor")))) 

6.3.2 model fitting

### fit the enet model
### baseline
child_baseline_recipe_list <- purrr::map(.x = child_baseline_train_select,
                             ~recipe_prep(train_input=.x,features_input = Child_Personality_names)) 

child_pls_fit_baseline <- child_baseline_recipe_list %>%
                purrr::map(.,~pls_tune(recipe_input = .,feature_input = Child_Personality_names))

child_pls_fit_wf <- purrr::map(child_pls_fit_baseline,"pls_final_wf")


child_pls_pred_baseline <- purrr::pmap(list(child_baseline_recipe_list,
                                        child_pls_fit_wf,
                                        child_baseline_test_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 





child_pls_pred_baseline_results <- purrr::map(child_pls_pred_baseline,"model_predict")


child_pls_pred_baseline_train <- pmap(list(child_baseline_recipe_list,
                                                         child_pls_fit_wf,
                                                         child_baseline_train_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 

child_pls_pred_baseline_results_train <- purrr::map(child_pls_pred_baseline_train,"model_predict")

child_baseline_metric <- map2(.x=child_pls_pred_baseline_results,
     .y=psychopathology_baseline_test,~metric_compute_site(data_input =.x ,
                                           site_input = .y)) %>%
                      do.call(rbind,.)

      child_baseline_metric%>% 
    kableExtra::kbl(caption = "metrics for all sites in baseline") %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for all sites in baseline
correlation tradrsq MAE RMSE site
0.2301139 0.0503490 0.7780712 0.9657546 site01
0.2874749 0.0824662 0.7419444 0.9577124 site02
0.1467884 0.0010057 0.7965594 0.9986610 site03
0.2575813 0.0652398 0.7581326 0.9661721 site04
0.4070034 0.1547431 0.7202570 0.9181113 site05
0.2398759 0.0539322 0.7639640 0.9724013 site06
0.3167741 0.0995826 0.7478518 0.9474378 site07
0.3068713 0.0939157 0.7518646 0.9504796 site08
0.2035175 0.0353097 0.7944463 0.9809361 site09
0.2422655 0.0566274 0.7536802 0.9693236 site10
0.3332697 0.1087794 0.7426469 0.9440394 site11
0.3238142 0.1034506 0.7446165 0.9468595 site12
0.2435109 0.0566474 0.7666898 0.9705830 site13
0.2155426 0.0407826 0.7726659 0.9785745 site14
0.2746212 0.0750269 0.7840942 0.9600162 site15
0.2220486 0.0432180 0.7553823 0.9776334 site16
0.3363135 0.1113770 0.7372554 0.9418107 site17
0.2264853 0.0484188 0.7581694 0.9741956 site18
0.2572657 0.0646609 0.7638518 0.9667113 site19
0.2923401 0.0852555 0.7549576 0.9557182 site20
0.2689318 0.0721260 0.7410211 0.9616586 site21
child_baseline_metric_avg <- average_metric_one_mod(metric_list =child_baseline_metric)

child_baseline_metric_avg_table <- child_baseline_metric_avg %>%
  mutate_if(is.numeric, round, digits=3)%>%
  mutate("correlation (sd)" = paste0(correlation," (",cor_sd,")"))%>%
  mutate("tradrsq (sd)" = paste0(tradrsq," (",rsq_sd,")"))%>%
  mutate("MAE (sd)" = paste0(MAE," (",mae_sd,")"))%>%
  mutate("RMSE (sd)" = paste0(RMSE," (",rmse_sd,")"))%>%
  select_if(is.character)
  
  child_baseline_metric_avg_table%>%
    dplyr::select(all_of(avg_table_var_names))%>%
    kableExtra::kbl(caption = paste0("metrics for modalities averaged across sites in baseline")) %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for modalities averaged across sites in baseline
correlation (sd) tradrsq (sd) MAE (sd) RMSE (sd)
0.268 (0.057) 0.072 (0.033) 0.758 (0.019) 0.962 (0.017)
### fit the enet model
### followup
child_followup_recipe_list <- purrr::map(.x=child_followup_train_select,~recipe_prep(train_input=.x,
                                                              features_input = Child_Personality_names)) 

child_pls_fit_followup <- child_followup_recipe_list %>%                
              purrr::map(.,~pls_tune(recipe_input = .,feature_input = Child_Personality_names))

child_pls_fit_wf_followup <- purrr::map(child_pls_fit_followup,"pls_final_wf")


child_pls_pred_followup <- pmap(list(child_followup_recipe_list,
                                                         child_pls_fit_wf_followup,
                                                         child_followup_test_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 

child_pls_pred_followup_results <- purrr::map(child_pls_pred_followup,"model_predict")


child_pls_pred_followup_train <- pmap(list(child_followup_recipe_list,
                                                         child_pls_fit_wf_followup,
                                                         child_followup_train_select),~
                                                 model_final_fit(recipe_input = ..1, 
                                    wf_input = ..2,
                                    test_data = ..3)) 

child_pls_pred_followup_results_train <- purrr::map(child_pls_pred_followup_train,"model_predict")


child_followup_metric <- map2(.x=child_pls_pred_followup_results,
     .y=psychopathology_followup_test,~metric_compute_site(data_input =.x ,
                                           site_input = .y)) %>%
                      do.call(rbind,.)

      child_followup_metric%>% 
    kableExtra::kbl(caption = "metrics for all sites in followup") %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for all sites in followup
correlation tradrsq MAE RMSE site
0.4218630 0.1753951 0.6411196 0.8156294 site01
0.2920636 0.0567293 0.7565497 0.9607149 site02
0.2800944 0.0558003 0.7632825 0.9321039 site03
0.3432039 0.1113752 0.7532259 0.9581028 site04
0.4224944 0.1724779 0.6842332 0.8436319 site05
0.2164499 0.0087977 0.6891667 0.8766325 site06
0.2967338 0.0701306 0.7033264 0.9148413 site07
0.2359942 0.0453781 0.7032737 0.9000243 site08
0.2907953 0.0798290 0.6962024 0.8750216 site09
0.2937269 0.0821288 0.6872819 0.8864543 site10
0.1729606 -0.0149503 0.7340666 0.9483008 site11
0.4618552 0.1990698 0.6385183 0.8140173 site12
0.3227122 0.0854849 0.7062496 0.8734003 site13
0.2556427 0.0524753 0.7229956 0.9271541 site14
0.3416293 0.0520811 0.8449642 1.0249051 site15
0.2732442 0.0568895 0.7437904 0.9335117 site16
0.1991069 -0.0201470 0.7311221 0.9384023 site17
0.3858643 0.1320589 0.6989886 0.8857691 site18
0.3237461 0.1027530 0.7876615 0.9811494 site19
0.3561586 0.1257222 0.7484811 0.9462670 site20
0.3074633 0.0800055 0.6752093 0.8736007 site21
child_followup_metric_avg <- average_metric_one_mod(metric_list =child_followup_metric)

child_followup_metric_avg_table <- child_followup_metric_avg %>%
  mutate_if(is.numeric, round, digits=3)%>%
  mutate("correlation (sd)" = paste0(correlation," (",cor_sd,")"))%>%
  mutate("tradrsq (sd)" = paste0(tradrsq," (",rsq_sd,")"))%>%
  mutate("MAE (sd)" = paste0(MAE," (",mae_sd,")"))%>%
  mutate("RMSE (sd)" = paste0(RMSE," (",rmse_sd,")"))%>%
  select_if(is.character)
  
  child_followup_metric_avg_table%>%
    dplyr::select(all_of(avg_table_var_names))%>%
    kableExtra::kbl(caption = paste0("metrics for modalities averaged across sites in followup")) %>%
    kableExtra::kable_classic(full_width = F, 
                             html_font = "Cambria")
metrics for modalities averaged across sites in followup
correlation (sd) tradrsq (sd) MAE (sd) RMSE (sd)
0.309 (0.074) 0.081 (0.058) 0.72 (0.048) 0.91 (0.053)

7 Scatterplots

Join the predicted values of three different sets of featues of mental health together.

7.1 combine all the needed data-sets

psy_seperate_baseline <- list(parents=parents_pls_pred_baseline_results %>% do.call(rbind,.),
                              CBCL=CBCL_pls_pred_baseline_results%>% do.call(rbind,.),
                              child=child_pls_pred_baseline_results%>% do.call(rbind,.))

psy_seperate_followup <- list(parents=parents_pls_pred_followup_results %>% do.call(rbind,.),
                              CBCL=CBCL_pls_pred_followup_results%>% do.call(rbind,.),
                              child=child_pls_pred_followup_results%>% do.call(rbind,.))


baseline_corr_string_vec <- c(parents_baseline_metric_avg_table$`correlation (sd)`,
                              CBCL_baseline_metric_avg_table$`correlation (sd)`,
                              child_baseline_metric_avg_table$`correlation (sd)`)

followup_corr_string_vec <- c(parents_followup_metric_avg_table$`correlation (sd)`,
                              CBCL_followup_metric_avg_table$`correlation (sd)`,
                              child_followup_metric_avg_table$`correlation (sd)`)

plotting_titles <- c("ASR","CBCL", "Child Personality")

7.2 Plotting

psy_seperate_plot_baseline <- pmap(list( psy_seperate_baseline,plotting_titles,baseline_corr_string_vec),
                                   ~scatter_plot_gfactor_string(data_input=..1,
                                                         name_input=..2,
                                                         corr_string_input = ..3))

title_baseline <- ggdraw() + 
  draw_label(
    "Baseline",
    fontface = 'bold',
    x = 0,
    hjust = 0
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 7)
  )

psy_seperate_plot_baseline_grid <- cowplot::plot_grid(title_baseline,
                                  cowplot::plot_grid(plotlist = psy_seperate_plot_baseline,nrow = 1,ncol = 3),
                                       nrow = 2 , 
                                       rel_heights = c(0.1, 1))
## Warning: Removed 2 rows containing non-finite values (`stat_pointdensity()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 9 rows containing non-finite values (`stat_pointdensity()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 9 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
title_scatter_plot_all <- ggdraw() + 
  draw_label(
    "Out-of-Sample Predictive Ability",
    fontface = 'bold',
    x = 0,
    hjust = 0
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 7)
  )

labelled_scatter_baseline <- ggpubr::annotate_figure(psy_seperate_plot_baseline_grid,
                          left= ggpubr::text_grob("Observed Cognitive Performance (Z)",size=15,rot=90),
                        bottom = ggpubr::text_grob("Predicted Cognitive Performance (Z)",size=15)) 

cowplot::plot_grid(title_scatter_plot_all,labelled_scatter_baseline,
                                       nrow = 2 , 
                                       rel_heights = c(0.1, 1))

psy_seperate_plot_followup <- pmap(list( psy_seperate_followup, plotting_titles,followup_corr_string_vec),
                                   ~scatter_plot_gfactor_string(data_input=..1,
                                                         name_input=..2,
                                                         corr_string_input = ..3))

title_followup <- ggdraw() + 
  draw_label(
    "Followup",
    fontface = 'bold',
    x = 0,
    hjust = 0
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 7)
  )

psy_seperate_plot_followup_grid <- cowplot::plot_grid(title_followup,
                              cowplot::plot_grid(plotlist = psy_seperate_plot_followup,nrow = 1,ncol = 3),
                                       nrow = 2 , 
                                       rel_heights = c(0.1, 1))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 9 rows containing non-finite values (`stat_pointdensity()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 9 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
labelled_scatter_followup <- ggpubr::annotate_figure(psy_seperate_plot_followup_grid,
                          left= ggpubr::text_grob("Observed Cognitive Performance (Z)",size=15,rot=90),
                        bottom = ggpubr::text_grob("Predicted Cognitive Performance (Z)",size=15)) 

cowplot::plot_grid(title_scatter_plot_all,labelled_scatter_followup,
                                       nrow = 2 , 
                                       rel_heights = c(0.1, 1))

8 Save the outputs

save the metrics performances

psychopathology_baseline_metric_output_table <- psychopathology_baseline_metric_avg_table %>% 
                                                mutate(modality = "Mental Health")
psychopathology_followup_metric_output_table <- psychopathology_followup_metric_avg_table %>% 
                                                mutate(modality = "Mental Health")


parents_baseline_metric_output_table <- parents_baseline_metric_avg_table %>% 
                                                mutate(modality = "ASR")
parents_followup_metric_output_table <- parents_followup_metric_avg_table %>% 
                                                mutate(modality = "ASR")

CBCL_baseline_metric_output_table <- CBCL_baseline_metric_avg_table %>% 
                                                mutate(modality = "CBCL")
CBCL_followup_metric_output_table<- CBCL_followup_metric_avg_table %>% 
                                                mutate(modality = "CBCL")

child_baseline_metric_output_table<- child_baseline_metric_avg_table %>% 
                                                mutate(modality = "Child personality")
child_followup_metric_output_table<- child_followup_metric_avg_table %>% 
                                                mutate(modality = "Child personality")

baseline_output_table <- bind_rows(psychopathology_baseline_metric_output_table,
                                   CBCL_baseline_metric_output_table,
                                   parents_baseline_metric_output_table,
                                   child_baseline_metric_output_table) %>%
                        mutate(event = "baseline")


followup_output_table <- bind_rows(psychopathology_followup_metric_output_table,
                                   CBCL_followup_metric_output_table,
                                   parents_followup_metric_output_table,
                                   child_followup_metric_output_table) %>%
                        mutate(event = "followup")


output_table <- bind_rows(baseline_output_table,followup_output_table)
saveRDS(output_table,paste0(scriptfold,'Common_psy_gene_brain_all/saved_outputs/performance_metrics/mental_health_performance_metric', '.RData'))

save the scatterplot list

saveRDS(psy_seperate_plot_baseline,paste0(scriptfold,'Common_psy_gene_brain_all/saved_outputs/scatter_plots/psy_seperate_plot_baseline', '.RData'))
saveRDS(psy_seperate_plot_followup,paste0(scriptfold,'Common_psy_gene_brain_all/saved_outputs/scatter_plots/psy_seperate_plot_followup', '.RData'))